diff --git a/.gitmodules b/.gitmodules index e69de29bb2d..605fac63cc4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,12 @@ +[submodule "cpp/nvgraph/cpp/thirdparty/cnmem"] + path = cpp/nvgraph/cpp/thirdparty/cnmem + url = https://github.com/NVIDIA/cnmem.git + branch = master +[submodule "cpp/nvgraph/cpp/thirdparty/cub"] + path = cpp/nvgraph/cpp/thirdparty/cub + url = https://github.com/NVlabs/cub.git + branch = 1.8.0 +[submodule "cpp/nvgraph/external/cusp"] + path = cpp/nvgraph/external/cusp + url = https://github.com/cusplibrary/cusplibrary.git + branch = cuda9 diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d16706725f..8cce3b51853 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,10 @@ ## Improvements - PR #157 Removed cudatoolkit dependency in setup.py - PR #185 Update docs version +- PR #194 Open source nvgraph in the cugraph repository - PR #190 Added a copy option in graph creation - PR #196 Fix typos in readme intro - ## Bug Fixes - PR #169 Disable terminal output in sssp - PR #191 Fix double upload bug diff --git a/README.md b/README.md index db970371aae..10cb81027fd 100644 --- a/README.md +++ b/README.md @@ -192,7 +192,7 @@ conda activate cugraph_dev 3) Build and install `libcugraph`. CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`. - This project uses cmake for building the C/C++ library. To configure cmake, run: + This project uses cmake for building the C/C++ library. CMake will also automatically build and install the nvGraph library (`$CUGRAPH_HOME/cpp/nvgraph`), which may take a few minutes. To configure cmake, run: ```bash # Set the localtion to cuGraph in an environment variable CUGRAPH_HOME @@ -320,8 +320,9 @@ unset LD_LIBRARY_PATH +## nvGraph - +The nvGraph library is now open source and part of cuGraph. It can be built as a stand-alone library by following nvgraph's [readme](cpp/nvgraph/).
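The README addition above notes that the regular libcugraph configure step now also builds and installs nvGraph via the new `ConfigureNvgraph.cmake` module, with the required third-party sources pulled in through the new `.gitmodules` entries. As a rough sketch of how those pieces fit together (the paths and install prefix below are illustrative, not prescriptive):

```bash
# Fetch the cnmem, cub and cusp submodules declared in .gitmodules
cd $CUGRAPH_HOME
git submodule update --init --recursive

# Configure and build libcugraph; cpp/cmake/Modules/ConfigureNvgraph.cmake
# builds and installs nvGraph (cpp/nvgraph) into the same prefix as part of it
cd cpp
mkdir -p build && cd build
cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
make -j && make install
```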
------ diff --git a/ci/cpu/cugraph/upload-anaconda.sh b/ci/cpu/cugraph/upload-anaconda.sh index 7ea8c7069ac..19461c77d4f 100755 --- a/ci/cpu/cugraph/upload-anaconda.sh +++ b/ci/cpu/cugraph/upload-anaconda.sh @@ -4,26 +4,18 @@ set -e -if [ "$BUILD_CUGRAPH" == "1" ]; then - if [ "$BUILD_ABI" == "1" ]; then - export UPLOADFILE=`conda build conda/recipes/cugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output` - else - export UPLOADFILE=`conda build conda/recipes/cugraph -c rapidsai/label/cf201901 -c nvidia/label/cf201901 -c numba -c conda-forge/label/cf201901 -c defaults --python=$PYTHON --output` - fi +if [ "$UPLOAD_CUGRAPH" == "1" ]; then + export UPLOADFILE=`conda build conda/recipes/cugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output` SOURCE_BRANCH=master # Have to label all CUDA versions due to the compatibility to work with any CUDA - if [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "1" ]; then + if [ "$LABEL_MAIN" == "1" ]; then LABEL_OPTION="--label main --label cuda9.2 --label cuda10.0" - elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "1" ]; then + elif [ "$LABEL_MAIN" == "0" ]; then LABEL_OPTION="--label dev --label cuda9.2 --label cuda10.0" - elif [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "0" ]; then - LABEL_OPTION="--label cf201901 --label cf201901-cuda9.2 --label cf201901-cuda10.0" - elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "0" ]; then - LABEL_OPTION="--label cf201901-dev --label cf201901-cuda9.2 --label cf201901-cuda10.0" else - echo "Unknown label configuration LABEL_MAIN='$LABEL_MAIN' BUILD_ABI='$BUILD_ABI'" + echo "Unknown label configuration LABEL_MAIN='$LABEL_MAIN'" exit 1 fi echo "LABEL_OPTION=${LABEL_OPTION}" @@ -44,4 +36,7 @@ if [ "$BUILD_CUGRAPH" == "1" ]; then echo "Upload" echo ${UPLOADFILE} anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE} +else + echo "Skipping cugraph upload" + return 0 fi \ No newline at end of file diff --git a/ci/cpu/libcugraph/upload-anaconda.sh b/ci/cpu/libcugraph/upload-anaconda.sh index 6c66a3169f0..c66c8cae137 100755 --- a/ci/cpu/libcugraph/upload-anaconda.sh +++ b/ci/cpu/libcugraph/upload-anaconda.sh @@ -4,31 +4,23 @@ set -e -if [ "$BUILD_LIBCUGRAPH" == "1" ]; then +if [ "$UPLOAD_LIBCUGRAPH" == "1" ]; then CUDA_REL=${CUDA:0:3} if [ "${CUDA:0:2}" == '10' ]; then # CUDA 10 release CUDA_REL=${CUDA:0:4} fi - if [ "$BUILD_ABI" == "1" ]; then - export UPLOADFILE=`conda build conda/recipes/libcugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output` - else - export UPLOADFILE=`conda build conda/recipes/libcugraph -c rapidsai/label/cf201901 -c nvidia/label/cf201901 -c numba -c conda-forge/label/cf201901 -c defaults --python=$PYTHON --output` - fi + export UPLOADFILE=`conda build conda/recipes/libcugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output` SOURCE_BRANCH=master - if [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "1" ]; then + if [ "$LABEL_MAIN" == "1" ]; then LABEL_OPTION="--label main --label cuda${CUDA_REL}" - elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "1" ]; then + elif [ "$LABEL_MAIN" == "0" ]; then LABEL_OPTION="--label dev --label cuda${CUDA_REL}" - elif [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "0" ]; then - LABEL_OPTION="--label cf201901 --label cf201901-cuda${CUDA_REL}" - elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "0" ]; then - LABEL_OPTION="--label cf201901-dev --label cf201901-cuda${CUDA_REL}" else - echo "Unknown label 
configuration LABEL_MAIN='$LABEL_MAIN' BUILD_ABI='$BUILD_ABI'" + echo "Unknown label configuration LABEL_MAIN='$LABEL_MAIN'" exit 1 fi echo "LABEL_OPTION=${LABEL_OPTION}" @@ -49,4 +41,6 @@ if [ "$BUILD_LIBCUGRAPH" == "1" ]; then echo "Upload" echo ${UPLOADFILE} anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE} +else + echo "Skipping libcugraph upload" fi diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index c7303a1a298..1a1c2a69064 100644 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -1,15 +1,17 @@ #!/usr/bin/env bash export BUILD_ABI=1 +export BUILD_CUGRAPH=1 +export BUILD_LIBCUGRAPH=1 if [[ "$CUDA" == "9.2" ]]; then - export BUILD_CUGRAPH=1 + export UPLOAD_CUGRAPH=1 else - export BUILD_CUGRAPH=0 + export UPLOAD_CUGRAPH=0 fi if [[ "$PYTHON" == "3.6" ]]; then - export BUILD_LIBCUGRAPH=1 + export UPLOAD_LIBCUGRAPH=1 else - export BUILD_LIBCUGRAPH=0 + export UPLOAD_LIBCUGRAPH=0 fi diff --git a/conda/environments/cugraph_dev.yml b/conda/environments/cugraph_dev.yml index efd1c15097c..3323a846901 100644 --- a/conda/environments/cugraph_dev.yml +++ b/conda/environments/cugraph_dev.yml @@ -7,7 +7,6 @@ channels: - defaults dependencies: - cudf>=0.5.1 -- nvgraph - scipy - networkx - python-louvain diff --git a/conda/environments/cugraph_dev_cuda10.yml b/conda/environments/cugraph_dev_cuda10.yml index 7168452a843..51a114ccc29 100644 --- a/conda/environments/cugraph_dev_cuda10.yml +++ b/conda/environments/cugraph_dev_cuda10.yml @@ -7,7 +7,6 @@ channels: - defaults dependencies: - cudf>=0.5.1 -- nvgraph - scipy - networkx - python-louvain diff --git a/conda/environments/cugraph_nightly.yml b/conda/environments/cugraph_nightly.yml index 65ba1a1fa84..e5ee17033c7 100644 --- a/conda/environments/cugraph_nightly.yml +++ b/conda/environments/cugraph_nightly.yml @@ -8,7 +8,6 @@ channels: - defaults dependencies: - cudf=0.6 -- nvgraph - scipy - networkx - python-louvain diff --git a/conda/environments/cugraph_nightly_cuda10.yml b/conda/environments/cugraph_nightly_cuda10.yml index 5097c044a54..d8070b78793 100644 --- a/conda/environments/cugraph_nightly_cuda10.yml +++ b/conda/environments/cugraph_nightly_cuda10.yml @@ -8,7 +8,6 @@ channels: - defaults dependencies: - cudf=0.6 -- nvgraph - scipy - networkx - python-louvain diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index af3122d0f19..15da0bc8396 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -129,6 +129,11 @@ message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") # - cmake custom modules -------------------------------------------------------------------------- include(ConfigureGoogleTest) +# speedup build time by avoiding features that are not exposed +set(NVGRAPH_LIGHT True) +# build nvgraph +include(ConfigureNvgraph) + ################################################################################################### # - Find and add different modules and supporting repos ------------------------------------------- find_package(Boost 1.45.0 COMPONENTS system) @@ -186,12 +191,11 @@ endif (RMM_INCLUDE AND RMM_LIBRARY) ################################################################################################### # - add nvgraph ----------------------------------------------------------------------------------- + find_path(NVGRAPH_INCLUDE "nvgraph" - HINTS "$ENV{NVGRAPH_ROOT}/include" - "$ENV{CONDA_PREFIX}/include") -find_library(NVGRAPH_LIBRARY "nvgraph_st" - HINTS "$ENV{NVGRAPH_ROOT}/lib" - "$ENV{CONDA_PREFIX}/lib") + HINTS "$ENV{CONDA_PREFIX}/include") 
+find_library(NVGRAPH_LIBRARY "nvgraph_rapids" + HINTS "$ENV{CONDA_PREFIX}/lib") add_library( nvgraph SHARED IMPORTED) if (NVGRAPH_INCLUDE AND NVGRAPH_LIBRARY) diff --git a/cpp/cmake/Modules/ConfigureNvgraph.cmake b/cpp/cmake/Modules/ConfigureNvgraph.cmake new file mode 100644 index 00000000000..16d9a55dbe8 --- /dev/null +++ b/cpp/cmake/Modules/ConfigureNvgraph.cmake @@ -0,0 +1,65 @@ +set(NVGRAPH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph") + +set(NVGRAPH_CMAKE_ARGS "") + #" -DNVGRAPH_build_samples=ON" + #" -DCMAKE_VERBOSE_MAKEFILE=ON") + +if(NOT CMAKE_CXX11_ABI) + message(STATUS "NVGRAPH: Disabling the GLIBCXX11 ABI") + list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") + list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") +elseif(CMAKE_CXX11_ABI) + message(STATUS "NVGRAPH: Enabling the GLIBCXX11 ABI") + list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1") + list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1") +endif(NOT CMAKE_CXX11_ABI) + +#configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Nvgraph.CMakeLists.txt.cmake" +# "${NVGRAPH_ROOT}/cpp/CMakeLists.txt") + +file(MAKE_DIRECTORY "${NVGRAPH_ROOT}/cpp/build") +#file(MAKE_DIRECTORY "${NVGRAPH_ROOT}/install") + +execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} .. -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} -DNVGRAPH_LIGHT=${NVGRAPH_LIGHT} + RESULT_VARIABLE NVGRAPH_CONFIG + WORKING_DIRECTORY ${NVGRAPH_ROOT}/cpp/build) + +if(NVGRAPH_CONFIG) + message(FATAL_ERROR "Configuring nvgraph failed: " ${NVGRAPH_CONFIG}) +endif(NVGRAPH_CONFIG) + +set(PARALLEL_BUILD -j) +if($ENV{PARALLEL_LEVEL}) + set(NUM_JOBS $ENV{PARALLEL_LEVEL}) + set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}") +endif($ENV{PARALLEL_LEVEL}) + +if(${NUM_JOBS}) + if(${NUM_JOBS} EQUAL 1) + message(STATUS "NVGRAPH BUILD: Enabling Sequential CMake build") + elseif(${NUM_JOBS} GREATER 1) + message(STATUS "NVGRAPH BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs") + endif(${NUM_JOBS} EQUAL 1) +else() + message(STATUS "NVGRAPH BUILD: Enabling Parallel CMake build with all threads") +endif(${NUM_JOBS}) + +execute_process(COMMAND ${CMAKE_COMMAND} --build . -- ${PARALLEL_BUILD} + RESULT_VARIABLE NVGRAPH_BUILD + WORKING_DIRECTORY ${NVGRAPH_ROOT}/cpp/build) +if(NVGRAPH_BUILD) + message(FATAL_ERROR "Building nvgraph failed: " ${NVGRAPH_BUILD}) +endif(NVGRAPH_BUILD) + +execute_process(COMMAND ${CMAKE_COMMAND} --build . --target install + RESULT_VARIABLE NVGRAPH_BUILD + WORKING_DIRECTORY ${NVGRAPH_ROOT}/cpp/build) + +if(NVGRAPH_BUILD) + message(FATAL_ERROR "Installing nvgraph failed: " ${NVGRAPH_BUILD}) +endif(NVGRAPH_BUILD) + +message(STATUS "nvgraph installed under: " ${CMAKE_INSTALL_PREFIX}) +set(NVGRAPH_INCLUDE "${CMAKE_INSTALL_PREFIX}/include/nvgraph.h ${CMAKE_INSTALL_PREFIX}/include/test_opt_utils.cuh") +set(NVGRAPH_LIBRARY "${CMAKE_INSTALL_PREFIX}/lib/libnvgraph_rapids.so") +set(NVGRAPH_FOUND TRUE) diff --git a/cpp/nvgraph/README.md b/cpp/nvgraph/README.md new file mode 100644 index 00000000000..173b18e4cbb --- /dev/null +++ b/cpp/nvgraph/README.md @@ -0,0 +1,103 @@ +#
 nvgraph - NVIDIA graph library
+ +Data analytics is a growing application of high-performance computing. Many advanced data analytics problems can be couched as graph problems. In turn, many of the common graph problems today can be couched as sparse linear algebra. This is the motivation for nvGRAPH, which harnesses the power of GPUs for linear algebra to handle large graph analytics and big data analytics problems. + +## Development Setup + +### Conda {#conda} + +It is easy to install nvGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download). + +Install and update nvGraph using the conda command: + +```bash +# CUDA 9.2 +conda install -c nvidia nvgraph + +# CUDA 10.0 +conda install -c nvidia/label/cuda10.0 nvgraph + +``` + +Note: This conda installation only applies to Linux and Python versions 3.6/3.7. + +### Build from Source {#source} + +The following instructions are for developers and contributors to nvGraph OSS development. They are tested on Linux Ubuntu 16.04 & 18.04; other operating systems may be compatible, but are not currently tested. Use them to build nvGraph from source and contribute to its development. + +The nvGraph package is a C/C++ CUDA library. It needs to be built and installed before it can be used. + + +#### Prerequisites + +Compiler requirement: + +* `gcc` version 5.4+ +* `nvcc` version 9.2 +* `cmake` version 3.12 + + + +CUDA requirement: + +* CUDA 9.2+ +* NVIDIA driver 396.44+ +* Pascal architecture or better + +You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). + + +#### Build and Install the C/C++ CUDA components + +To install nvGraph from source, ensure the dependencies are met and follow the steps below: + +1) Clone the repository and submodules + + ```bash + # Set the location to cuGraph in an environment variable CUGRAPH_HOME + export CUGRAPH_HOME=$(pwd)/cugraph + + # Download the cuGraph repo + git clone https://github.com/rapidsai/cugraph.git $CUGRAPH_HOME + + # Next load all the submodules + cd $CUGRAPH_HOME + git submodule update --init --recursive + ``` + +2) Build and install `libnvgraph_rapids.so`. CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`. + + This project uses cmake for building the C/C++ library. To configure cmake, run: + + ```bash + cd $CUGRAPH_HOME + cd cpp/nvgraph/cpp # enter nvgraph's cpp directory + mkdir build # create build directory + cd build # enter the build directory + cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + + # now build the code + make -j # "-j" starts multiple threads + make install # install the libraries + ``` + +The default installation locations are `$CMAKE_INSTALL_PREFIX/lib` and `$CMAKE_INSTALL_PREFIX/include/nvgraph` respectively. + +#### C++ stand-alone tests + +```bash +# Run the nvgraph tests +cd $CUGRAPH_HOME +cd cpp/nvgraph/cpp/build +gtests/NVGRAPH_TEST # this is an executable file +``` +Other test executables require specific datasets and will result in failure if they are not present. + +## Documentation + +The C API documentation can be found in the [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/nvgraph/index.html).
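As an optional post-install smoke test (not part of the instructions above), the sketch below compiles a tiny program against the freshly installed `libnvgraph_rapids.so` and simply creates and destroys an nvGraph handle. The file name and flags are illustrative assumptions; depending on your environment you may also need `$CONDA_PREFIX/lib` on `LD_LIBRARY_PATH` at run time:

```bash
# Write a tiny program that exercises the nvGraph C API
cat > nvgraph_check.cu <<'EOF'
#include <nvgraph.h>
#include <cstdio>

int main() {
    nvgraphHandle_t handle;
    // nvgraphCreate initializes the library and allocates an opaque handle
    if (nvgraphCreate(&handle) != NVGRAPH_STATUS_SUCCESS) {
        std::printf("nvgraphCreate failed\n");
        return 1;
    }
    nvgraphDestroy(handle);
    std::printf("nvGraph is installed and usable\n");
    return 0;
}
EOF

# Compile and link against the headers and library installed above
nvcc nvgraph_check.cu -I$CONDA_PREFIX/include/nvgraph \
     -L$CONDA_PREFIX/lib -lnvgraph_rapids -o nvgraph_check
./nvgraph_check
```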
+ + + diff --git a/cpp/nvgraph/conda-recipes/nvgraph/LICENSE b/cpp/nvgraph/conda-recipes/nvgraph/LICENSE new file mode 100644 index 00000000000..d8708b3facc --- /dev/null +++ b/cpp/nvgraph/conda-recipes/nvgraph/LICENSE @@ -0,0 +1,152 @@ +LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS +(July 26, 2018 version) + +This license agreement, including exhibits attached ("Agreement”) is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of a NVIDIA software development kit (“SDK”). + +Each SDK has its own set of software and materials, but here is a description of the types of items that may be included in a SDK: source code, header files, APIs, data sets and assets (examples include images, textures, models, scenes, videos, native API input/output files), binary software, sample code, libraries, utility programs, programming code and documentation. + +This Agreement can be accepted only by an adult of legal age of majority in the country in which the SDK is used. + +If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind the entity to this Agreement, in which case “you” will mean the entity you represent. + +If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not download, install or use the SDK. + +You agree to use the SDK only for purposes that are permitted by (a) this Agreement, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + +1. License. + +1.1 Grant + +Subject to the terms of this Agreement, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly provided in this Agreement) to: + +(i) Install and use the SDK, + +(ii) Modify and create derivative works of sample source code delivered in the SDK, and + +(iii) Distribute those portions of the SDK that are identified in this Agreement as distributable, as incorporated in object code format into a software application that meets the distribution requirements indicated in this Agreement. + +1.2 Distribution Requirements + +These are the distribution requirements for you to exercise the distribution grant: + +(i) Your application must have material additional functionality, beyond the included portions of the SDK. + +(ii) The distributable portions of the SDK shall only be accessed by your application. + +(iii) The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” + +(iv) Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. + +(v) The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. + +(vi) You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SDK not in compliance with the requirements of this Agreement, and to enforce the terms of your agreements with respect to distributed SDK. 
+ +1.3 Authorized Users + +You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SDK from your secure network to perform work on your behalf. + +If you are an academic institution you may allow users enrolled or employed by the academic institution to access and use the SDK from your secure network. + +You are responsible for the compliance with the terms of this Agreement by your authorized users. If you become aware that your authorized users didn’t follow the terms of this Agreement, you agree to take reasonable steps to resolve the non-compliance and prevent new occurrences. + +1.4 Pre-Release SDK +The SDK versions identified as alpha, beta, preview or otherwise as pre-release, may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. Use of a pre-release SDK may result in unexpected results, loss of data, project delays or other unpredictable damage or loss. +You may use a pre-release SDK at your own risk, understanding that pre-release SDKs are not intended for use in production or business-critical systems. +NVIDIA may choose not to make available a commercial version of any pre-release SDK. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SDK at any time without liability. +1.5 Updates + +NVIDIA may, at its option, make available patches, workarounds or other updates to this SDK. Unless the updates are provided with their separate governing terms, they are deemed part of the SDK licensed to you as provided in this Agreement. + +You agree that the form and content of the SDK that NVIDIA provides may change without prior notice to you. While NVIDIA generally maintains compatibility between versions, NVIDIA may in some cases make changes that introduce incompatibilities in future versions of the SDK. + +1.6 Third Party Licenses + +The SDK may come bundled with, or otherwise include or be distributed with, third party software licensed by a NVIDIA supplier and/or open source software provided under an open source license. Use of third party software is subject to the third-party license terms, or in the absence of third party terms, the terms of this Agreement. Copyright to third party software is held by the copyright holders indicated in the third-party software or license. + +1.7 Reservation of Rights + +NVIDIA reserves all rights, title and interest in and to the SDK not expressly granted to you under this Agreement. + +2. Limitations. + +The following license limitations apply to your use of the SDK: + +2.1 You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SDK or copies of the SDK. + +2.2 Except as expressly provided in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify, or create derivative works of any portion of the SDK. For clarity, you may not distribute or sublicense the SDK as a stand-alone product. + +2.3 Unless you have an agreement with NVIDIA for this purpose, you may not indicate that an application created with the SDK is sponsored or endorsed by NVIDIA. + +2.4 You may not bypass, disable, or circumvent any encryption, security, digital rights management or authentication mechanism in the SDK. 
+ +2.5 You may not use the SDK in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SDK be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. + +2.6 Unless you have an agreement with NVIDIA for this purpose, you may not use the SDK with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in nuclear, avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SDK for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. + +2.7 You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to your use of the SDK outside of the scope of this Agreement, or not in compliance with its terms. + +3. Ownership. + +3.1 NVIDIA or its licensors hold all rights, title and interest in and to the SDK and its modifications and derivative works, including their respective intellectual property rights, subject to your rights under Section 3.2. This SDK may include software and materials from NVIDIA’s licensors, and these licensors are intended third party beneficiaries that may enforce this Agreement with respect to their intellectual property rights. + +3.2 You hold all rights, title and interest in and to your applications and your derivative works of the sample source code delivered in the SDK, including their respective intellectual property rights, subject to NVIDIA’s rights under section 3.1. + +3.3 You may, but don’t have to, provide to NVIDIA suggestions, feature requests or other feedback regarding the SDK, including possible enhancements or modifications to the SDK. For any feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) it without the payment of any royalties or fees to you. NVIDIA will use feedback at its choice. NVIDIA is constantly looking for ways to improve its products, so you may send feedback to NVIDIA through the developer portal at https://developer.nvidia.com. + +4. No Warranties. + +THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE. + +5. Limitations of Liability. 
+ +TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + +These exclusions and limitations of liability shall apply regardless if NVIDIA or its affiliates have been advised of the possibility of such damages, and regardless of whether a remedy fails its essential purpose. These exclusions and limitations of liability form an essential basis of the bargain between the parties, and, absent any of these exclusions or limitations of liability, the provisions of this Agreement, including, without limitation, the economic terms, would be substantially different. + +6. Termination. + +6.1 This Agreement will continue to apply until terminated by either you or NVIDIA as described below. + +6.2 If you want to terminate this Agreement, you may do so by stopping to use the SDK. + +6.3 NVIDIA may, at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement and the non-compliance is not fixed within thirty (30) days following notice from NVIDIA (or immediately if you violate NVIDIA’s intellectual property rights); (ii) you commence or participate in any legal proceeding against NVIDIA with respect to the SDK; or (iii) NVIDIA decides to no longer provide the SDK in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. + +6.4 Upon any termination of this Agreement, you agree to promptly discontinue use of the SDK and destroy all copies in your possession or control. Your prior distributions in accordance with this Agreement are not affected by the termination of this Agreement. Upon written request, you will certify in writing that you have complied with your commitments under this section. Upon any termination of this Agreement all provisions survive except for the license grant provisions. + +7. General. + +If you wish to assign this Agreement or your rights and obligations, including by merger, consolidation, dissolution or operation of law, contact NVIDIA to ask for permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. NVIDIA may assign, delegate or transfer this Agreement and its rights and obligations, and if to a non-affiliate you will be notified. + +You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement. + +This Agreement will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. 
+ +The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this Agreement. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +If any court of competent jurisdiction determines that any provision of this Agreement is illegal, invalid or unenforceable, such provision will be construed as limited to the extent necessary to be consistent with and fully enforceable under the law and the remaining provisions will remain in full force and effect. Unless otherwise specified, remedies are cumulative. + +Each party acknowledges and agrees that the other is an independent contractor in the performance of this Agreement. + +The SDK has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this Agreement pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (c)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + +The SDK is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SDK into any country, or use the SDK in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this Agreement, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SDK. + +Any notice delivered by NVIDIA to you under this Agreement will be delivered via mail, email or fax. You agree that any notices that NVIDIA sends you electronically will satisfy any legal communication requirements. Please direct your legal notices or other correspondence to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. + +This Agreement and any exhibits incorporated into this Agreement constitute the entire agreement of the parties with respect to the subject matter of this Agreement and supersede all prior negotiations or documentation exchanged between the parties relating to this SDK license. Any additional and/or conflicting terms on documents issued by you are null, void, and invalid. Any amendment or waiver under this Agreement shall be in writing and signed by representatives of both parties. + + +CUDA STRING SUPPLEMENT TO SOFTWARE LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS +(September 18, 2018 version) + +The terms in this supplement govern your use of the NVIDIA CUDA String SDK under the terms of your license agreement (“Agreement”) as modified by this supplement. Capitalized terms used but not defined below have the meaning assigned to them in the Agreement. + +This supplement is an exhibit to the Agreement and is incorporated as an integral part of the Agreement. 
In the event of conflict between the terms in this supplement and the terms in the Agreement, the terms in this supplement govern. + +1. License Scope. The SDK is licensed for you to develop applications only for use in systems with NVIDIA GPUs. + +2. Distribution. The following portions of the SDK are distributable under the Agreement: cuString library. + +3. Licensing. If the distribution terms in this Agreement are not suitable for your organization, or for any questions regarding this Agreement, please contact NVIDIA at nvidia-compute-license-questions@nvidia.com. + + diff --git a/cpp/nvgraph/conda-recipes/nvgraph/build.sh b/cpp/nvgraph/conda-recipes/nvgraph/build.sh new file mode 100644 index 00000000000..ebb2f3177f8 --- /dev/null +++ b/cpp/nvgraph/conda-recipes/nvgraph/build.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CMAKE_COMMON_VARIABLES=" -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX11_ABI=$CMAKE_CXX11_ABI" + + +if [ -n "$MACOSX_DEPLOYMENT_TARGET" ]; then + # C++11 requires 10.9 + # but cudatoolkit 8 is build for 10.11 + export MACOSX_DEPLOYMENT_TARGET=10.11 +fi + +# show environment +printenv +# Cleanup local git +git clean -xdf +# Change directory for build process +cd cpp +# Use CMake-based build procedure +mkdir build +cd build +# configure +cmake $CMAKE_COMMON_VARIABLES .. +# build +make -j VERBOSE=1 install \ No newline at end of file diff --git a/cpp/nvgraph/conda-recipes/nvgraph/meta.yaml b/cpp/nvgraph/conda-recipes/nvgraph/meta.yaml new file mode 100644 index 00000000000..d13066591aa --- /dev/null +++ b/cpp/nvgraph/conda-recipes/nvgraph/meta.yaml @@ -0,0 +1,27 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. + +# Usage: +# conda build -c defaults -c conda-forge . +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %} +{% set cuda_version='.'.join(environ.get('CUDA_VERSION', 'unknown').split('.')[:2]) %} +package: + name: nvgraph + version: {{ version }} + +source: + path: ../.. 
+ +build: + number: {{ git_revision_count }} + string: cuda{{ cuda_version }}_{{ git_revision_count }} + +requirements: + build: + - cmake 3.12.4 + +about: + home: http://nvidia.com/ + license: LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS + license_file: LICENSE + summary: nvgraph Library diff --git a/cpp/nvgraph/conda_build.sh b/cpp/nvgraph/conda_build.sh new file mode 100755 index 00000000000..4432989676c --- /dev/null +++ b/cpp/nvgraph/conda_build.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -xe + +conda install conda-build anaconda-client conda-verify -y +conda build -c nvidia -c rapidsai -c conda-forge -c defaults conda-recipes/nvgraph + +if [ "$UPLOAD_PACKAGE" == '1' ]; then + export UPLOADFILE=`conda build -c nvidia -c rapidsai -c conda-forge -c defaults conda-recipes/nvgraph --output` + SOURCE_BRANCH=master + + test -e ${UPLOADFILE} + CUDA_REL=${CUDA:0:3} + if [ "${CUDA:0:2}" == '10' ]; then + # CUDA 10 release + CUDA_REL=${CUDA:0:4} + fi + + LABEL_OPTION="--label dev --label cuda${CUDA_REL}" + if [ "${LABEL_MAIN}" == '1' ]; then + LABEL_OPTION="--label main --label cuda${CUDA_REL}" + fi + echo "LABEL_OPTION=${LABEL_OPTION}" + + if [ -z "$MY_UPLOAD_KEY" ]; then + echo "No upload key" + return 0 + fi + + echo "Upload" + echo ${UPLOADFILE} + anaconda -t ${MY_UPLOAD_KEY} upload -u nvidia ${LABEL_OPTION} --force ${UPLOADFILE} +else + echo "Skipping upload" +fi \ No newline at end of file diff --git a/cpp/nvgraph/cpp/CMakeLists.txt b/cpp/nvgraph/cpp/CMakeLists.txt new file mode 100644 index 00000000000..42d365400e6 --- /dev/null +++ b/cpp/nvgraph/cpp/CMakeLists.txt @@ -0,0 +1,219 @@ +#============================================================================= +# Copyright (c) 2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) + +project(NV_GRAPH VERSION 0.4.0 LANGUAGES C CXX CUDA) + +################################################################################################### +# - compiler options ------------------------------------------------------------------------------ + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_C_COMPILER $ENV{CC}) +set(CMAKE_CXX_COMPILER $ENV{CXX}) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) + +if(CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") +endif(CMAKE_COMPILER_IS_GNUCXX) + +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") + +# set warnings as errors +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror cross-execution-space-call -Xcompiler -Wall,-Werror") + +# set default build type +set(CMAKE_BUILD_TYPE "Release") + +option(BUILD_TESTS "Configure CMake to build tests" + ON) + +if(CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") + + option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" OFF) + if(CMAKE_CXX11_ABI) + message(STATUS "nvGraph: Enabling the GLIBCXX11 ABI") + else() + message(STATUS "nvGraph: Disabling the GLIBCXX11 ABI") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -D_GLIBCXX_USE_CXX11_ABI=0") + endif(CMAKE_CXX11_ABI) +endif(CMAKE_COMPILER_IS_GNUCXX) + +################################################################################################### +# - cmake modules --------------------------------------------------------------------------------- + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" ${CMAKE_MODULE_PATH}) + +include(FeatureSummary) +include(CheckIncludeFiles) +include(CheckLibraryExists) + +################################################################################################### +# - add gtest ------------------------------------------------------------------------------------- + +if(BUILD_TESTS) + include(CTest) + include(ConfigureGoogleTest) + + if(GTEST_FOUND) + message(STATUS "Google C++ Testing Framework (Google Test) found in ${GTEST_ROOT}") + include_directories(${GTEST_INCLUDE_DIR}) + add_subdirectory(${CMAKE_SOURCE_DIR}/tests) + else() + message(AUTHOR_WARNING "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.") + endif(GTEST_FOUND) +endif(BUILD_TESTS) + +################################################################################################### +# - include paths --------------------------------------------------------------------------------- + +include_directories( + "${CMAKE_BINARY_DIR}/include" + "${CMAKE_SOURCE_DIR}/include" + "${CMAKE_SOURCE_DIR}/thirdparty/cub" + "${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include" + "${CMAKE_SOURCE_DIR}/../external" + "${CMAKE_SOURCE_DIR}/../external/cusp" + "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" + ) + +################################################################################################### +# - library paths --------------------------------------------------------------------------------- + +link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported 
variable containing the link directories for nvcc + "${CMAKE_BINARY_DIR}/lib" + "${GTEST_LIBRARY_DIR}") + +################################################################################################### +# - library targets ------------------------------------------------------------------------------- +if(NVGRAPH_LIGHT MATCHES True) + add_library(nvgraph_rapids SHARED + thirdparty/cnmem/src/cnmem.cpp + src/arnoldi.cu + src/bfs.cu + src/bfs2d.cu + src/bfs_kernels.cu + src/convert.cu + src/csrmv.cu + src/csrmv_cub.cu + src/csr_graph.cpp + src/graph_extractor.cu + src/jaccard_gpu.cu + src/kmeans.cu + src/lanczos.cu + src/lobpcg.cu + src/matrix.cu + src/modularity_maximization.cu + src/nvgraph.cu + src/nvgraph_cusparse.cpp + src/nvgraph_cublas.cpp + src/nvgraph_error.cu + src/nvgraph_lapack.cu + src/nvgraph_vector_kernels.cu + src/pagerank.cu + src/pagerank_kernels.cu + src/partition.cu + src/size2_selector.cu + src/sssp.cu + src/triangles_counting.cpp + src/triangles_counting_kernels.cu + src/valued_csr_graph.cpp + src/widest_path.cu + ) +else(NVGRAPH_LIGHT MATCHES True) + add_library(nvgraph_rapids SHARED + thirdparty/cnmem/src/cnmem.cpp + src/arnoldi.cu + src/bfs.cu + src/bfs2d.cu + src/bfs_kernels.cu + src/convert.cu + src/csrmv.cu + src/csrmv_cub.cu + src/csr_graph.cpp + src/graph_extractor.cu + src/jaccard_gpu.cu + src/kmeans.cu + src/lanczos.cu + src/lobpcg.cu + src/matrix.cu + src/modularity_maximization.cu + src/nvgraph.cu + src/nvgraph_cusparse.cpp + src/nvgraph_cublas.cpp + src/nvgraph_error.cu + src/nvgraph_lapack.cu + src/nvgraph_vector_kernels.cu + src/pagerank.cu + src/pagerank_kernels.cu + src/partition.cu + src/size2_selector.cu + src/sssp.cu + src/triangles_counting.cpp + src/triangles_counting_kernels.cu + src/valued_csr_graph.cpp + src/widest_path.cu + src/graph_contraction/contraction_csr_max.cu + src/graph_contraction/contraction_csr_sum.cu + src/graph_contraction/contraction_mv_double_mul.cu + src/graph_contraction/contraction_mv_float_min.cu + src/graph_contraction/contraction_csr_min.cu + src/graph_contraction/contraction_mv_double_max.cu + src/graph_contraction/contraction_mv_double_sum.cu + src/graph_contraction/contraction_mv_float_mul.cu + src/graph_contraction/contraction_csr_mul.cu + src/graph_contraction/contraction_mv_double_min.cu + src/graph_contraction/contraction_mv_float_max.cu + src/graph_contraction/contraction_mv_float_sum.cu + ) +endif(NVGRAPH_LIGHT MATCHES True) + +################################################################################################### +# - build options --------------------------------------------------------------------------------- + +if(CMAKE_BUILD_TYPE MATCHES Debug) + message(STATUS "Building with debugging flags") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") +endif(CMAKE_BUILD_TYPE MATCHES Debug) + +if(NVGRAPH_LIGHT MATCHES True) + add_definitions( -DNVGRAPH_LIGHT=${NVGRAPH_LIGHT} ) +endif(NVGRAPH_LIGHT MATCHES True) + + +################################################################################################### +# - link libraries -------------------------------------------------------------------------------- + +target_link_libraries(nvgraph_rapids cublas cusparse curand cusolver cudart ) + +################################################################################################### +# - install targets ------------------------------------------------------------------------------- + +install(TARGETS nvgraph_rapids + DESTINATION lib) + +install(FILES + 
${CMAKE_CURRENT_SOURCE_DIR}/include/nvgraph.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/test_opt_utils.cuh + DESTINATION include/nvgraph) + + diff --git a/cpp/nvgraph/cpp/cmake/Modules/ConfigureGoogleTest.cmake b/cpp/nvgraph/cpp/cmake/Modules/ConfigureGoogleTest.cmake new file mode 100644 index 00000000000..6120dc51aba --- /dev/null +++ b/cpp/nvgraph/cpp/cmake/Modules/ConfigureGoogleTest.cmake @@ -0,0 +1,55 @@ +set(GTEST_ROOT "${CMAKE_BINARY_DIR}/googletest") + +set(GTEST_CMAKE_ARGS " -Dgtest_build_samples=ON" + " -DCMAKE_VERBOSE_MAKEFILE=ON") +if(NOT CMAKE_CXX11_ABI) + message(STATUS "GTEST: Disabling the GLIBCXX11 ABI") + list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") + list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") +elseif(CMAKE_CXX11_ABI) + message(STATUS "GTEST: Enabling the GLIBCXX11 ABI") + list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1") + list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1") +endif(NOT CMAKE_CXX11_ABI) + +configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/GoogleTest.CMakeLists.txt.cmake" + "${GTEST_ROOT}/CMakeLists.txt") + +file(MAKE_DIRECTORY "${GTEST_ROOT}/build") +file(MAKE_DIRECTORY "${GTEST_ROOT}/install") + +execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} . + RESULT_VARIABLE GTEST_CONFIG + WORKING_DIRECTORY ${GTEST_ROOT}) + +if(GTEST_CONFIG) + message(FATAL_ERROR "Configuring GoogleTest failed: " ${GTEST_CONFIG}) +endif(GTEST_CONFIG) + +# Parallel builds cause Travis to run out of memory +unset(PARALLEL_BUILD) +if($ENV{TRAVIS}) + if(NOT DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL}) + message(STATUS "Disabling Parallel CMake build on Travis") + else() + set(PARALLEL_BUILD --parallel) + message(STATUS "Using $ENV{CMAKE_BUILD_PARALLEL_LEVEL} build jobs on Travis") + endif(NOT DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL}) +else() + set(PARALLEL_BUILD --parallel) + message("STATUS Enabling Parallel CMake build") +endif($ENV{TRAVIS}) + +execute_process(COMMAND ${CMAKE_COMMAND} --build ${PARALLEL_BUILD} .. + RESULT_VARIABLE GTEST_BUILD + WORKING_DIRECTORY ${GTEST_ROOT}/build) + +if(GTEST_BUILD) + message(FATAL_ERROR "Building GoogleTest failed: " ${GTEST_BUILD}) +endif(GTEST_BUILD) + +message(STATUS "GoogleTest installed here: " ${GTEST_ROOT}/install) +set(GTEST_INCLUDE_DIR "${GTEST_ROOT}/install/include") +set(GTEST_LIBRARY_DIR "${GTEST_ROOT}/install/lib") +set(GTEST_FOUND TRUE) + diff --git a/cpp/nvgraph/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake b/cpp/nvgraph/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake new file mode 100644 index 00000000000..66e1dc85a50 --- /dev/null +++ b/cpp/nvgraph/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.12) + +include(ExternalProject) + +ExternalProject_Add(GoogleTest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.8.0 + SOURCE_DIR "${GTEST_ROOT}/googletest" + BINARY_DIR "${GTEST_ROOT}/build" + INSTALL_DIR "${GTEST_ROOT}/install" + CMAKE_ARGS ${GTEST_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${GTEST_ROOT}/install) + + + + + + + + diff --git a/cpp/nvgraph/cpp/include/2d_partitioning.h b/cpp/nvgraph/cpp/include/2d_partitioning.h new file mode 100644 index 00000000000..c344990db12 --- /dev/null +++ b/cpp/nvgraph/cpp/include/2d_partitioning.h @@ -0,0 +1,1376 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* + * 2d_partitioning.h + * + * Created on: Apr 9, 2018 + * Author: jwyles + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvgraph { + + template + struct CSR_Result_Weighted { + int64_t size; + int64_t nnz; + T* rowOffsets; + T* colIndices; + W* edgeWeights; + + CSR_Result_Weighted() : + size(0), nnz(0), rowOffsets(NULL), colIndices(NULL), edgeWeights(NULL) { + } + + void Destroy() { + if (rowOffsets) + cudaFree(rowOffsets); + if (colIndices) + cudaFree(colIndices); + if (edgeWeights) + cudaFree(edgeWeights); + } + }; + + // Define kernel for copying run length encoded values into offset slots. + template + __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) { + for (int32_t idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < runCounts; + idx += gridDim.x * blockDim.x) { + offsets[unique[idx]] = counts[idx]; + } + } + + /** + * Method for converting COO to CSR format + * @param sources The array of source indices + * @param destinations The array of destination indices + * @param edgeWeights The array of edge weights + * @param nnz The number of non zero values + * @param maxId The largest id contained in the matrix + * @param result The result is stored here. 
+ */ + template + void ConvertCOOtoCSR_weighted(T* sources, + T* destinations, + W* edgeWeights, + int64_t nnz, + T maxId, + CSR_Result_Weighted& result) { + // Sort source and destination columns by source + // Allocate local memory for operating on + T* srcs, *dests; + W* weights = NULL; + cudaMalloc(&srcs, sizeof(T) * nnz); + cudaMalloc(&dests, sizeof(T) * nnz); + if (edgeWeights) + cudaMalloc(&weights, sizeof(W) * nnz); + cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault); + cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault); + if (edgeWeights) + cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault); + + // Call Thrust::sort_by_key to sort the arrays with srcs as keys: + if (edgeWeights) + thrust::sort_by_key(thrust::device, + srcs, + srcs + nnz, + thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); + else + thrust::sort_by_key(thrust::device, srcs, srcs + nnz, dests); + + result.size = maxId + 1; + + // Allocate offsets array + cudaMalloc(&result.rowOffsets, (maxId + 2) * sizeof(T)); + + // Set all values in offsets array to zeros + cudaMemset(result.rowOffsets, 0, (maxId + 2) * sizeof(T)); + + // Allocate temporary arrays same size as sources array, and single value to get run counts + T* unique, *counts, *runCount; + cudaMalloc(&unique, (maxId + 1) * sizeof(T)); + cudaMalloc(&counts, (maxId + 1) * sizeof(T)); + cudaMalloc(&runCount, sizeof(T)); + + // Use CUB run length encoding to get unique values and run lengths + void *tmpStorage = NULL; + size_t tmpBytes = 0; + cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + cudaMalloc(&tmpStorage, tmpBytes); + cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + cudaFree(tmpStorage); + + // Set offsets to run sizes for each index + T runCount_h; + cudaMemcpy(&runCount_h, runCount, sizeof(T), cudaMemcpyDefault); + int threadsPerBlock = 1024; + int numBlocks = min(65535, (runCount_h + threadsPerBlock - 1) / threadsPerBlock); + offsetsKernel<<>>(runCount_h, unique, counts, result.rowOffsets); + + // Scan offsets to get final offsets + thrust::exclusive_scan(thrust::device, + result.rowOffsets, + result.rowOffsets + maxId + 2, + result.rowOffsets); + + // Clean up temporary allocations + result.nnz = nnz; + result.colIndices = dests; + result.edgeWeights = weights; + cudaFree(srcs); + cudaFree(unique); + cudaFree(counts); + cudaFree(runCount); + } + + /** + * Describes the 2D decomposition of a partitioned matrix. + */ + template + class MatrixDecompositionDescription { + protected: + GlobalType numRows; // Global number of rows in matrix + GlobalType numCols; // Global number of columns in matrix + GlobalType nnz; // Global number of non-zeroes in matrix + GlobalType blockRows; // Number of rows of blocks in the decomposition + GlobalType blockCols; // Number of columns of rows in the decomposition + LocalType offset; + // Offsets-like arrays for rows and columns defining the start/end of the + // sections of the global id space belonging to each row and column. 
+ std::vector rowOffsets; + std::vector colOffsets; + // Array of integers one for each block, defining the device it is assigned to + std::vector deviceAssignments; + std::vector blockStreams; + public: + + MatrixDecompositionDescription() : + numRows(0), numCols(0), nnz(0), blockRows(0), blockCols(0) { + rowOffsets.push_back(0); + colOffsets.push_back(0); + deviceAssignments.push_back(0); + } + + // Basic constructor, just takes in the values of its members. + MatrixDecompositionDescription(GlobalType numRows, + GlobalType numCols, + GlobalType nnz, + GlobalType blockRows, + GlobalType blockCols, + std::vector rowOffsets, + std::vector colOffsets, + std::vector deviceAssignments) : + numRows(numRows), numCols(numCols), nnz(nnz), blockRows(blockRows), + blockCols(blockCols), rowOffsets(rowOffsets), colOffsets(colOffsets), + deviceAssignments(deviceAssignments) { + } + + // Constructs a MatrixDecompositionDescription for a square matrix given the + // number of rows in the matrix and number of rows of blocks. + MatrixDecompositionDescription(GlobalType numRows, + GlobalType numBlockRows, + GlobalType nnz, + std::vector devices) : + numRows(numRows), + numCols(numRows), + blockRows(numBlockRows), + blockCols(numBlockRows), + nnz(nnz) { + // Tracking the current set device to change back + int currentDevice; + cudaGetDevice(¤tDevice); + + // Setting up the row and col offsets into equally sized chunks + GlobalType remainder = numRows % blockRows; + if (remainder != 0) + offset = (numRows + blockRows - remainder) / blockRows; + else + offset = numRows / blockRows; + + rowOffsets.resize(blockRows + 1); + colOffsets.resize(blockRows + 1); + for (int i = 0; i < blockRows; i++) { + rowOffsets[i] = i * offset; + colOffsets[i] = i * offset; + } + rowOffsets.back() = blockRows * offset; + colOffsets.back() = blockCols * offset; + + // Setting up the device assignments using the given device ids and also + // setting up the stream associated with each block. + deviceAssignments.resize(getNumBlocks()); + blockStreams.resize(getNumBlocks()); + for (int i = 0; i < getNumBlocks(); i++) { + int device = devices[i % devices.size()]; + deviceAssignments[i] = device; + cudaSetDevice(device); + cudaStream_t stream; + cudaStreamCreate(&stream); + blockStreams[i] = stream; + } + + // Restoring to current device when called + cudaSetDevice(currentDevice); + } + + // Gets the row id for the block containing the given global row id + int32_t getRowId(GlobalType val) const { + return std::upper_bound(rowOffsets.begin(), rowOffsets.end(), val) - rowOffsets.begin() - 1; + } + + // Gets the column id for the block containing the given global column id + int32_t getColId(GlobalType val) const { + return std::upper_bound(colOffsets.begin(), colOffsets.end(), val) - colOffsets.begin() - 1; + } + + // Gets the number of blocks in the decomposition: + int32_t getNumBlocks() const { + return blockRows * blockCols; + } + + // Getter for offset + LocalType getOffset() const { + return offset; + } + + // Getter for deviceAssignments + const std::vector& getDeviceAssignments() const { + return deviceAssignments; + } + + /** + * Getter for vector of streams for each block. 
+ * @return Reference to vector of streams for each block + */ + const std::vector& getBlockStreams() const { + return blockStreams; + } + + /** + * Getter for nnz + * @return The global number of non-zero elements + */ + GlobalType getNnz() const { + return nnz; + } + + /** + * Getter method for numRows + * @return The number of global rows in the matrix + */ + GlobalType getNumRows() const { + return numRows; + } + + /** + * Getter for BlockRows + * @return The number of blocks in a row in the decomposition. + */ + GlobalType getBlockRows() const { + return blockRows; + } + + /** + * Getter for BlockCols + * @return The number of blocks in a column in the decomposition. + */ + GlobalType getBlockCols() const { + return blockCols; + } + + /** + * Given a block id, returns the row which that block is in. + * @param bId The block ID + * @return The row number + */ + int32_t getBlockRow(int32_t bId) const { + return bId / blockCols; + } + + /** + * Given a block id, returns the column which that block is in. + * @param bId The block ID + * @return The column number + */ + int32_t getBlockCol(int32_t bId) const { + return bId % blockCols; + } + + /** + * Takes a COO global row and produces the COO local row and the block to which it belongs. + * @param globalRow The global row ID + * @param globalCol The global column ID + * @param localRow The block local row ID (return) + * @param localCol The block local column ID (return) + * @param blockId The block ID (return) + */ + void convertGlobaltoLocalRow(GlobalType globalRow, + GlobalType globalCol, + LocalType& localRow, + LocalType& localCol, + int32_t& blockId) const { + int32_t rowId = getRowId(globalRow); + int32_t colId = getColId(globalCol); + blockId = rowId * blockCols + colId; + localRow = globalRow - rowOffsets[rowId]; + localCol = globalCol - colOffsets[colId]; + } + + /** + * Takes in a row ID and column ID and returns the corresponding block ID + * @param rowId The row ID + * @param colId The column ID + * @return The ID of the corresponding block + */ + int32_t getBlockId(int32_t rowId, int32_t colId) const { + return rowId * blockCols + colId; + } + + /** + * Helper method to synchronize all streams after operations are issued. + */ + void syncAllStreams() const { + int32_t numBlocks = getNumBlocks(); + int32_t current_device; + cudaGetDevice(¤t_device); + for (int32_t i = 0; i < numBlocks; i++) { + cudaSetDevice(deviceAssignments[i]); + cudaStreamSynchronize(blockStreams[i]); + } + cudaSetDevice(current_device); + } + + /** + * This method is only for testing and debugging use. + * @return A human readable string representation of the object + */ + std::string toString() const { + std::stringstream ss; + ss << "Global Info:\n\tnumRows: " << numRows << ", numCols: " << numCols << ", nnz: " + << nnz; + ss << "\n"; + ss << "Block Info:\n\tblockRows: " << blockRows << ", blockCols: " << blockCols; + ss << "\n"; + ss << "rowOffsets: ["; + for (int i = 0; i < (int) rowOffsets.size(); i++) + ss << rowOffsets[i] << (i == (int) rowOffsets.size() - 1 ? "]\n" : ", "); + ss << "colOffsets: ["; + for (int i = 0; i < (int) colOffsets.size(); i++) + ss << colOffsets[i] << (i == (int) colOffsets.size() - 1 ? "]\n" : ", "); + ss << "deviceAssignments: ["; + for (int i = 0; i < (int) deviceAssignments.size(); i++) + ss << deviceAssignments[i] << (i == (int) deviceAssignments.size() - 1 ? 
"]\n" : ", "); + return ss.str(); + } + }; + + template + class Matrix2d { + protected: + // Description of the matrix decomposition + MatrixDecompositionDescription description; + + // Array of block matrices forming the decomposition + std::vector*> blocks; + public: + Matrix2d() { + } + Matrix2d(MatrixDecompositionDescription descr, + std::vector*> blocks) : + description(descr), blocks(blocks) { + } + + const MatrixDecompositionDescription& getMatrixDecompositionDescription() { + return description; + } + + MultiValuedCsrGraph* getBlockMatrix(int32_t bId) { + return blocks[bId]; + } + + std::string toString() { + std::stringstream ss; + ss << "MatrixDecompositionDescription:\n" << description.toString(); + for (int i = 0; i < (int) blocks.size(); i++) { + ss << "Block " << i << ":\n"; + size_t numVerts = blocks[i]->get_num_vertices(); + size_t numEdges = blocks[i]->get_num_edges(); + size_t numValues = blocks[i]->getNumValues(); + ss << "numVerts: " << numVerts << ", numEdges: " << numEdges << "\n"; + LocalType* rowOffsets = (LocalType*) malloc((numVerts + 1) * sizeof(LocalType)); + LocalType* colIndices = (LocalType*) malloc(numEdges * sizeof(LocalType)); + ValueType* values = NULL; + if (numValues > 0) + values = (ValueType*) malloc(numEdges * sizeof(ValueType)); + cudaMemcpy(rowOffsets, + blocks[i]->get_raw_row_offsets(), + (numVerts + 1) * sizeof(LocalType), + cudaMemcpyDefault); + cudaMemcpy(colIndices, + blocks[i]->get_raw_column_indices(), + numEdges * sizeof(LocalType), + cudaMemcpyDefault); + if (values) + cudaMemcpy(values, + blocks[i]->get_raw_edge_dim(0), + numEdges * sizeof(ValueType), + cudaMemcpyDefault); + int idxCount = numEdges >= (numVerts + 1) ? numEdges : (numVerts + 1); + ss << "Idx\tOffset\tColInd\tValue\n"; + for (int j = 0; j < idxCount; j++) { + if (j < (int) numVerts + 1 && j < (int) numEdges) + ss << j << ":\t" << rowOffsets[j] << "\t" << colIndices[j] << "\t" + << (values ? values[j] : 0) + << "\n"; + else if (j < (int) numVerts + 1 && j >= (int) numEdges) + ss << j << ":\t" << rowOffsets[j] << "\n"; + else if (j >= (int) numVerts + 1 && j < (int) numEdges) + ss << j << ":\t" << "\t" << colIndices[j] << "\t" << (values ? values[j] : 0) + << "\n"; + } + free(rowOffsets); + free(colIndices); + free(values); + } + return ss.str(); + } + }; + + template + class VertexData2D { + const MatrixDecompositionDescription* description; + int32_t n; + std::vector > values; + public: + /** + * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription + * object which describes the matrix the data is attached to. Data buffers are + * allocated for each block using the offset from the description to size the + * buffers, and to locate the buffers on the same GPU as the matrix block. 
+ */ + VertexData2D(const MatrixDecompositionDescription* descr) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = descr->getOffset(); + n = allocSize; + // Allocate the data for each block + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + ValueType* d_current, *d_alternate; + cudaMalloc(&d_current, sizeof(ValueType) * n); + cudaMalloc(&d_alternate, sizeof(ValueType) * n); + values[i].d_buffers[0] = d_current; + values[i].d_buffers[1] = d_alternate; + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription + * object, which describes the matrix the data is attached to, and an integer which indicates + * how many data elements should be allocated for each block. Data buffers are allocated + * for each block using the offset from the description to size the buffers, and to locate + * the buffers on the same GPU as the matrix block. + */ + VertexData2D(const MatrixDecompositionDescription* descr, size_t _n) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = _n; + n = allocSize; + // Allocate the data for each block + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + ValueType* d_current, *d_alternate; + cudaMalloc(&d_current, sizeof(ValueType) * n); + cudaMalloc(&d_alternate, sizeof(ValueType) * n); + values[i].d_buffers[0] = d_current; + values[i].d_buffers[1] = d_alternate; + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + ~VertexData2D() { + for (size_t i = 0; i < values.size(); i++) { + if (values[i].Current()) + cudaFree(values[i].Current()); + if (values[i].Alternate()) + cudaFree(values[i].Alternate()); + } + } + + /** + * Getter for n the size of each block's allocation in elements. + * @return The value of n + */ + int32_t getN() { + return n; + } + + /** + * Getter for the MatrixDecompositionDescription associated with this VertexData2D + * @return Pointer to the MatrixDecompositionDescription for this VertexData2D + */ + const MatrixDecompositionDescription* getDescription() { + return description; + } + + /** + * Gets the current buffer corresponding to the given block ID + */ + ValueType* getCurrent(int bId) { + return values[bId].Current(); + } + + /** + * Gets the alternate buffer corresponding to the given block ID + */ + ValueType* getAlternate(int bId) { + return values[bId].Alternate(); + } + + /** + * Swaps the current and alternate buffers for all block IDs + */ + void swapBuffers() { + for (size_t i = 0; i < values.size(); i++) + values[i].selector ^= 1; + } + + /** + * Sets an element in the global array, assuming that the data is currently + * valid and in the diagonal blocks. After calling this method either columnScatter + * or rowScatter should be called to propagate the change to all blocks. 
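+ *
+ * A minimal usage sketch (the template arguments, sizes, device list and vertex id
+ * below are illustrative assumptions, not values taken from this file):
+ *
+ *   std::vector<int> devices{0, 1};
+ *   // 2 x 2 block decomposition of a square matrix with 1M rows and ~8M non-zeros
+ *   MatrixDecompositionDescription<int64_t, int32_t> descr(1000000, 2, 8000000, devices);
+ *   VertexData2D<int64_t, int32_t, float> data(&descr);
+ *   data.fillElements(0.0f);     // initialize the diagonal blocks
+ *   data.setElement(42, 1.0f);   // write one value into the owning diagonal block
+ *   data.rowScatter();           // make the update visible to every block in that row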
+ */ + void setElement(GlobalType globalIndex, ValueType val) { + LocalType blockId = globalIndex / n; + LocalType blockOffset = globalIndex % n; + int32_t bId = description->getBlockId(blockId, blockId); + ValueType* copyTo = values[bId].Current() + blockOffset; + cudaMemcpy(copyTo, &val, sizeof(ValueType), cudaMemcpyDefault); + } + + /** + * Sets the elements of the global array, using the provided array of values. The values + * are set in the blocks of the diagonal, columnScatter or rowScatter should be called + * to propogate to all blocks. + * @param vals Pointer to an array with the values to be set. + */ + void setElements(ValueType* vals) { + LocalType offset = description->getOffset(); + int32_t numRows = description->getBlockRows(); + for (int i = 0; i < numRows; i++) { + int32_t id = description->getBlockId(i, i); + cudaStream_t stream = description->getBlockStreams()[id]; + ValueType* copyFrom = vals + i * n; + ValueType* copyTo = values[id].Current(); + cudaMemcpyAsync(copyTo, copyFrom, sizeof(ValueType) * n, cudaMemcpyDefault, stream); + } + description->syncAllStreams(); + } + + /** + * Fills the elements of the data array with the given value. + * The elements on the diagonal are filled with the given value. After filling, + * either rowScatter or columnScatter will copy the values across the blocks in + * either the rows or columns depending on the use. + * @param val The value to fill the array with + */ + void fillElements(ValueType val) { + int current_device; + cudaGetDevice(¤t_device); + int32_t numRows = description->getBlockRows(); + for (int32_t i = 0; i < numRows; i++) { + int32_t blockId = description->getBlockId(i, i); + ValueType* vals = getCurrent(blockId); + int deviceId = description->getDeviceAssignments()[blockId]; + cudaStream_t stream = description->getBlockStreams()[blockId]; + cudaSetDevice(deviceId); + thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); + } + description->syncAllStreams(); + cudaSetDevice(current_device); + } + + /** + * Copies the values of the diagonal blocks in this VertexData2D into the + * VertexData2D specified. + * @param other Pointer to the VertexData2D to copy into + */ + void copyTo(VertexData2D* other) { + const MatrixDecompositionDescription* otherDescr = + other->getDescription(); + // Do a quick check that the sizes of both block arrays are the same. + if (description->getBlockRows() == otherDescr->getBlockRows() && n == other->getN()) { + // Issue asynchronous copies for each block's data + for (int i = 0; i < description->getBlockRows(); i++) { + int32_t bId = description->getBlockId(i, i); + ValueType* copyFrom = getCurrent(bId); + ValueType* copyTo = other->getCurrent(bId); + cudaStream_t stream = description->getBlockStreams()[bId]; + cudaMemcpyAsync(copyTo, copyFrom, n * sizeof(ValueType), cudaMemcpyDefault, stream); + } + // Synchronize the streams after the copies are done + for (int i = 0; i < description->getBlockRows(); i++) { + int32_t bId = description->getBlockId(i, i); + cudaStream_t stream = description->getBlockStreams()[bId]; + cudaStreamSynchronize(stream); + } + } + } + + /** + * This method implements a row-wise reduction of each blocks data into a + * single array for each row. The block on the diagonal will have the result. 
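+ *
+ * The reduction functor is supplied as a template parameter and must be a binary
+ * functor usable with thrust::transform. A hedged sketch of the expected call,
+ * assuming ValueType is float and <thrust/functional.h> is included:
+ *
+ *   data.rowReduce< thrust::plus<float> >();  // sum each row's partial results into its diagonal block
+ *   data.rowScatter();                        // optionally broadcast the reduced row back to all blocks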
+ */ + template + void rowReduce() { + int current_device; + cudaGetDevice(¤t_device); + Operator op; + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the row into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree reduction. At each step the primary buffer of the sender is + // copied into the secondary buffer of the receiver. After the copy is done + // each receiver performs the reduction operator and stores the result in it's + // primary buffer. + for (int32_t j = 2; (j / 2) < numRows; j *= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t senderId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Alternate(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + + // Invoke the reduction operator on the receiver's GPU and values arrays. + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + ValueType* input1 = values[receiverId].Alternate(); + ValueType* input2 = values[receiverId].Current(); + thrust::transform(thrust::cuda::par.on(stream), + input1, + input1 + n, + input2, + input2, + op); + } + } + // Sync all active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // Set the device to the receiver and sync the stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This method implements a column-wise reduction of each blocks data into a + * single array for each column. The block on the diagonal will have the result. + */ + template + void columnReduce() { + int current_device; + cudaGetDevice(¤t_device); + Operator op; + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the row into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree reduction. At each step the primary buffer of the sender is + // copied into the secondary buffer of the receiver. After the copy is done + // each receiver performs the reduction operator and stores the result in it's + // primary buffer. 
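+ // Pairing for 4 blocks in a column (indices are positions in blockIds, with the
+ // diagonal block at position 0):
+ //   step j=2: 0 <- 1, 2 <- 3
+ //   step j=4: 0 <- 2
+ // so after ceil(log2(numRows)) steps the diagonal block holds the fully reduced data.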
+ for (int32_t j = 2; (j / 2) < numRows; j *= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t senderId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Alternate(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + + // Invoke the reduction operator on the receiver's GPU and values arrays. + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + ValueType* input1 = values[receiverId].Alternate(); + ValueType* input2 = values[receiverId].Current(); + thrust::transform(thrust::cuda::par.on(stream), + input1, + input1 + n, + input2, + input2, + op); + } + } + // Sync all active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // Set the device to the receiver and sync the stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a column-wise scatter of the global data from the corresponding + * row. i.e. The data reduced from row 1 is broadcast to all blocks in + * column 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void columnScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Current(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Synchronize all the active streams before next step. 
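+ // Only the receivers of this step are synchronized: every copy above was issued on a
+ // receiver's stream, and a sender's buffer is only read, never written, by the scatter.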
+ for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a row-wise scatter of the global data from the corresponding + * column. i.e. The data reduced from column 1 is broadcast to all blocks in + * row 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void rowScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Current(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Sync all the active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * Outputs a human readable string representation of this Vertex2d object. This is only + * intended to be used for de-bugging. 
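+ * It synchronously copies both buffers of every block back to the host before
+ * formatting, so it should not be called on performance-critical paths.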
+ * @return Human readable string representation + */ + std::string toString() { + std::stringstream ss; + ValueType* c = (ValueType*) malloc(sizeof(ValueType) * n); + ValueType* a = (ValueType*) malloc(sizeof(ValueType) * n); + + int32_t numBlocks = description->getNumBlocks(); + + ss << "Vertex2d:\n"; + for (int32_t i = 0; i < numBlocks; i++) { + ss << "Block " << i << ":\n"; + ss << "Idx\tCur\tAlt\n"; + cudaMemcpy(c, values[i].Current(), sizeof(ValueType) * n, cudaMemcpyDefault); + cudaMemcpy(a, values[i].Alternate(), sizeof(ValueType) * n, cudaMemcpyDefault); + for (int32_t j = 0; j < n; j++) { + ss << j << ":\t" << c[j] << "\t" << a[j] << "\n"; + } + } + + free(c); + free(a); + + return ss.str(); + } + }; + + template + class VertexData2D_Unbuffered { + const MatrixDecompositionDescription* description; + int32_t n; + std::vector values; + + public: + /** + * Sets up a VertexData2D_Unbuffered object with an element allocated for each vertex + * in each block. + * @param descr Pointer to a MatrixDecompositionDescription object describing the layout + * of the 2D blocks. + */ + VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = descr->getOffset(); + n = allocSize; + // Allocate the data for each block + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + cudaMalloc(&(values[i]), sizeof(ValueType) * n); + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Sets up a VertexData2D_Unbuffered object with _n elements allocated per block. + * @param descr Pointer to a MatrixDecompositionDescription object describing the layout + * of the 2D blocks. + * @param _n The number of elements to allocate per block. + */ + VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr, + size_t _n) : + description(descr), n(_n) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + // Allocate the data for each block + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + cudaMalloc(&(values[i]), sizeof(ValueType) * n); + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Destructor. Frees all allocated memory. + */ + ~VertexData2D_Unbuffered() { + for (size_t i = 0; i < values.size(); i++) { + if (values[i]) { + cudaFree(values[i]); + } + } + } + + /** + * Fills the elements of the data array with the given value. + * The elements on the diagonal are filled with the given value. After filling, + * either rowScatter or columnScatter will copy the values across the blocks in + * either the rows or columns depending on the use. 
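+ *
+ * Illustrative use (the template arguments and the descr object are assumptions made
+ * only for this sketch):
+ *
+ *   VertexData2D_Unbuffered<int64_t, int32_t, int32_t> flags(&descr);
+ *   flags.fillElements(0);     // clear the diagonal blocks
+ *   flags.columnScatter();     // replicate the cleared data down each column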
+ * @param val The value to fill the array with + */ + void fillElements(ValueType val) { + int current_device; + cudaGetDevice(¤t_device); + int32_t numRows = description->getBlockRows(); + for (int32_t i = 0; i < numRows; i++) { + int32_t blockId = description->getBlockId(i, i); + ValueType* vals = get(blockId); + int deviceId = description->getDeviceAssignments()[blockId]; + cudaStream_t stream = description->getBlockStreams()[blockId]; + cudaSetDevice(deviceId); + thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); + } + description->syncAllStreams(); + cudaSetDevice(current_device); + } + + /** + * This implements a column-wise scatter of the global data from the corresponding + * row. i.e. The data reduced from row 1 is broadcast to all blocks in + * column 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void columnScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId], + values[senderId], + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Synchronize all the active streams before next step. + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a row-wise scatter of the global data from the corresponding + * column. i.e. The data reduced from column 1 is broadcast to all blocks in + * row 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void rowScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. 
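+ // (The -1 pushed below is just a placeholder; it is overwritten with the diagonal
+ // block's id on the i == j iteration of the loop that follows.)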
+ std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId], + values[senderId], + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Sync all the active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * Getter for n + * @return The value of n + */ + int32_t getN() { + return n; + } + + /** + * Gets the pointer to the allocated memory for a specified block. + * @param bId The block id to get the memory for. + * @return A pointer to the allocated memory for the given block. + */ + ValueType* get(int32_t bId) { + return values[bId]; + } + }; + + /** + * This method takes in COO format matrix data and a MatrixDecompositionDescription and + * returns a Matrix2d object containing the given data. 
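+ *
+ * A hedged usage sketch (the tiny COO arrays, single-GPU device list and template
+ * arguments are made up for illustration only):
+ *
+ *   int64_t rows[] = {0, 1, 2, 3};
+ *   int64_t cols[] = {1, 2, 3, 0};
+ *   float   vals[] = {1.f, 1.f, 1.f, 1.f};
+ *   std::vector<int> devices{0};
+ *   MatrixDecompositionDescription<int64_t, int32_t> descr(4, 2, 4, devices);
+ *   Matrix2d<int64_t, int32_t, float> m2d =
+ *       COOto2d<int64_t, int32_t, float>(descr, rows, cols, vals);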
+ */ + template + Matrix2d COOto2d(MatrixDecompositionDescription descr, + GlobalType* rowIds, + GlobalType* colIds, + ValueType* values) { + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + + int32_t blockCount = descr.getNumBlocks(); + + // Allocate array of size global nnz to hold the block labels + int32_t* blockLabels = (int32_t*) malloc(descr.getNnz() * sizeof(int32_t)); + + // Allocate array to contain row counts for each block and initialize to zero + // Allocate array to contain position offsets for writing each blocks data + LocalType* blockCounts = (LocalType*) malloc(blockCount * sizeof(LocalType)); + LocalType* blockPos = (LocalType*) malloc(blockCount * sizeof(LocalType)); + for (int i = 0; i < blockCount; i++) { + blockCounts[i] = 0; + blockPos[i] = 0; + } + + // For each edge mark in the array the id of the block to which it will belong + int32_t blockId; + LocalType localRow; + LocalType localCol; + for (int i = 0; i < descr.getNnz(); i++) { + descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); + blockLabels[i] = blockId; + blockCounts[blockId]++; + } + + // Allocate arrays for putting each blocks data into + LocalType** blockRowIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); + LocalType** blockColIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); + ValueType** blockValues = NULL; + if (values) + blockValues = (ValueType**) malloc(blockCount * sizeof(ValueType*)); + for (int i = 0; i < blockCount; i++) { + blockRowIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); + blockColIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); + if (values) + blockValues[i] = (ValueType*) malloc(blockCounts[i] * sizeof(ValueType)); + } + + // Convert each blocks global rows to local ids and copy into block arrays + for (int i = 0; i < descr.getNnz(); i++) { + descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); + blockRowIds[blockId][blockPos[blockId]] = localRow; + blockColIds[blockId][blockPos[blockId]] = localCol; + if (values) + blockValues[blockId][blockPos[blockId]] = values[i]; + blockPos[blockId]++; + } + + // Allocate the result blocks vector + std::vector*> blockVector(blockCount); + + // Convert each blocks COO rows into CSR and create it's graph object. + for (int i = 0; i < blockCount; i++) { + // Set the device as indicated so the data ends up on the right GPU + cudaSetDevice(descr.getDeviceAssignments()[i]); + cudaStream_t stream = descr.getBlockStreams()[i]; + + if (blockCounts[i] > 0) { + CSR_Result_Weighted result; + ConvertCOOtoCSR_weighted(blockRowIds[i], + blockColIds[i], + values ? 
blockValues[i] : NULL, + (int64_t) blockCounts[i], + (descr.getOffset() - 1), + result); + MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) result.size, (size_t) result.nnz, stream); + if (values) + csrGraph->allocateEdgeData(1, NULL); + cudaMemcpy(csrGraph->get_raw_row_offsets(), + result.rowOffsets, + (result.size + 1) * sizeof(LocalType), + cudaMemcpyDefault); + cudaMemcpy(csrGraph->get_raw_column_indices(), + result.colIndices, + result.nnz * sizeof(LocalType), + cudaMemcpyDefault); + if (values) + cudaMemcpy(csrGraph->get_raw_edge_dim(0), + result.edgeWeights, + result.nnz * sizeof(LocalType), + cudaMemcpyDefault); + blockVector[i] = csrGraph; + result.Destroy(); + } + else { + MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) descr.getOffset(), (size_t) 0, stream); + cudaMemset( csrGraph->get_raw_row_offsets(), + 0, + sizeof(LocalType) * (descr.getOffset() + 1)); + blockVector[i] = csrGraph; + } + } + + // Free temporary memory + for (int i = 0; i < blockCount; i++) { + free(blockRowIds[i]); + free(blockColIds[i]); + if (values) + free(blockValues[i]); + } + free(blockRowIds); + free(blockColIds); + if (values) + free(blockValues); + + cudaSetDevice(current_device); + + // Put it all together into a Matrix2d object for return + return Matrix2d(descr, blockVector); + } +} diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu b/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu new file mode 100644 index 00000000000..b29acf1961d --- /dev/null +++ b/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include "test_opt_utils.cuh" +#include "graph_utils.cuh" + +//#define ENABLE_LOG TRUE +#define ENALBE_LOUVAIN true + +#include "nvlouvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" + +#include +#include +#include +#include +#include +#include +#include +using T = float; + +int main(int argc, char* argv[]){ + + if(argc < 2) + { + std::cout<< "Help : ./louvain_test matrix_market_file.mtx"<(fin, 1, &mc, &m, &k, &nnz)) ,0); + EXPECT_EQ(m,k); + + thrust::host_vector coo_ind_h(nnz); + thrust::host_vector csr_ptr_h(m+1); + thrust::host_vector csr_ind_h(nnz); + thrust::host_vector csr_val_h(nnz); + + EXPECT_EQ( (mm_to_coo(fin, 1, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL)), 0); + EXPECT_EQ( (coo_to_csr (m, k, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL, &csr_ptr_h[0], NULL, NULL, NULL)), 0); + + EXPECT_EQ(fclose(fin),0); + + thrust::device_vector csr_ptr_d(csr_ptr_h); + thrust::device_vector csr_ind_d(csr_ind_h); + thrust::device_vector csr_val_d(csr_val_h); + + thrust::device_vector tmp_1(nnz); + thrust::fill(thrust::cuda::par, tmp_1.begin(), tmp_1.end(), 1.0); + thrust::device_vector::iterator max_ele = thrust::max_element(thrust::cuda::par, csr_val_d.begin(), csr_val_d.end()); + + bool weighted = (*max_ele!=1.0); + + //std::cout<<(weighted?"Weighted ":"Not Weigthed ")<<" n_vertex: "< cluster_d(m, 0); + int* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + int* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); + T* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + int* init_cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); + int num_level; + + cudaProfilerStart(); + hr_clock.start(); + nvlouvain::louvain(csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + m, nnz, + weighted, has_init_cluster, + init_cluster_ptr, final_modulartiy, clustering_h, num_level); + + hr_clock.stop(&louvain_time); + cudaProfilerStop(); + + std::cout<<"Final modularity: "< +#include +#include +#include +#include "test_opt_utils.cuh" +#include "graph_utils.cuh" + +//#define ENABLE_LOG true +#define ENALBE_LOUVAIN true + +#include "nvlouvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" + +#include +#include +#include +#include +#include +#include +#include +using T = double; + +int main(int argc, char* argv[]){ + + if(argc < 2) + { + std::cout<< "Help : ./louvain_test matrix_market_file.mtx"<(fin, 1, &mc, &m, &k, &nnz)) ,0); + EXPECT_EQ(m,k); + + thrust::host_vector coo_ind_h(nnz); + thrust::host_vector csr_ptr_h(m+1); + thrust::host_vector csr_ind_h(nnz); + thrust::host_vector csr_val_h(nnz); + + EXPECT_EQ( (mm_to_coo(fin, 1, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL)), 0); + EXPECT_EQ( (coo_to_csr (m, k, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL, &csr_ptr_h[0], NULL, NULL, NULL)), 0); + + EXPECT_EQ(fclose(fin),0); + + thrust::device_vector csr_ptr_d(csr_ptr_h); + thrust::device_vector csr_ind_d(csr_ind_h); + thrust::device_vector csr_val_d(csr_val_h); + + thrust::device_vector tmp_1(nnz); + thrust::fill(thrust::cuda::par, tmp_1.begin(), tmp_1.end(), 1.0); + thrust::device_vector::iterator max_ele = thrust::max_element(thrust::cuda::par, csr_val_d.begin(), csr_val_d.end()); + + bool weighted = (*max_ele!=1.0); + + //std::cout<<(weighted?"Weighted ":"Not Weigthed ")<<" n_vertex: "< cluster_d(m, 0); + std::vector< std::vector > best_cluster_vec; + int* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + int* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); 
+ T* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + int* init_cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); + int num_level; + + cudaProfilerStart(); + hr_clock.start(); + + nvlouvain::louvain(csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + m, nnz, + weighted, has_init_cluster, + init_cluster_ptr, final_modulartiy, best_cluster_vec, num_level); + + hr_clock.stop(&louvain_time); + cudaProfilerStop(); + + std::cout<<"Final modularity: "<::iterator it = best_cluster_vec[i].begin(); it != best_cluster_vec[i].end(); ++it) + // std::cout << *it <<' '; + // std::cout << std::endl; + //} + } + return 0; +} + diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu b/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu new file mode 100644 index 00000000000..790a4788b6f --- /dev/null +++ b/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include +#include +// Turn on to see stats for each level +//#define ENABLE_LOG true +#include "nvlouvain.cuh" + + + +/* Louvain Clustering Sample + +Social network example: Zachary Karate Club +W. Zachary, “An information flow model for conflict and fission in small groups,” Journal of Anthropological Research, vol. 33, pp. 
452–473, 1977 +https://en.wikipedia.org/wiki/Zachary's_karate_club +-------------------------------------------------------------------- +V = 34 +E = 78 bidirectional, 156 directed edges + +Bidirectional edges list: +[2 1] [3 1] [3 2] [4 1] [4 2] [4 3] [5 1] [6 1] [7 1] [7 5] [7 6] [8 1] [8 2] [8 3] [8 4] [9 1] [9 3] [10 3] [11 1] [11 5] [11 6] [12 1] [13 1] [13 4] [14 1] [14 2] [14 3] [14 4] [17 6] [17 7] +[18 1] [18 2] [20 1] [20 2] [22 1] [22 2] [26 24] [26 25] [28 3] [28 24] [28 25] [29 3] [30 24] [30 27] [31 2] [31 9] [32 1] [32 25] [32 26] [32 29] [33 3] [33 9] [33 15] [33 16] +[33 19] [33 21] [33 23] [33 24] [33 30] [33 31] [33 32] [34 9] [34 10] [34 14] [34 15] [34 16] [34 19] [34 20] [34 21] [34 23] [34 24] [34 27] [34 28] [34 29] [34 30] [34 31] +[34 32] [34 33] + +CSR representation (directed): +csrRowPtrA_h {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156} +csrColIndA_h {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, 10, 16, 0, +4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, +24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, +26, 27, 28, 29, 30, 31, 32} +csrValA_h {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0} + +-------------------------------------------------------------------- + +Operation: Louvain Clustering default parameters in modularity maximization + +-------------------------------------------------------------------- + +Expected output: +This sample prints the modlarity score and compare against the python reference (https://python-louvain.readthedocs.io/en/latest/api.html) + + +*/ + +using namespace nvlouvain; + +void check_status(nvlouvainStatus_t status) +{ + if ((int)status != 0) + { + printf("ERROR : %s\n",nvlouvainStatusGetString(status)); + exit(0); + } +} + +int main(int argc, char **argv) +{ + // Hard-coded Zachary Karate Club network input + int csrRowPtrA_input [] = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, + 139, 156}; + int csrColIndA_input [] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, + 6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 
27, 29, 32, 33, + 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, + 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; + float csrValA_input [] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + int ref_clustering [] = {0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 1, 0, 0, 0, 2, 2, 1, 0, 2, 0, 2, 0, 2, 3, 3, 3, 2, 3, 3, 2, 2, 3, 2, 2}; + int *csrRowPtrA_h = &csrRowPtrA_input[0]; + int *csrColIndA_h = &csrColIndA_input[0]; + float *csrValA_h = &csrValA_input[0]; + + // Variables + const size_t n = 34, nnz = 156; + bool weighted = false; + bool has_init_cluster = false; + int *clustering_h, *init_cluster_ptr = nullptr;; + int num_levels = 0, hits =0; + float final_modulartiy = 0; + // Allocate host data for nvgraphSpectralClustering output + clustering_h = (int*)malloc(n*sizeof(int)); + + //Solve clustering with modularity maximization algorithm + check_status(louvain(csrRowPtrA_h, csrColIndA_h, csrValA_h, n, nnz, weighted, has_init_cluster, init_cluster_ptr, final_modulartiy, clustering_h, num_levels)); + + //Print quality (modualrity) + printf("Modularity_score: %f\n", final_modulartiy); + printf("num levels: %d\n", num_levels); + for (int i = 0; i < (int)n; i++) + if (clustering_h[i] == ref_clustering[i]) + hits++; + printf("Hit rate : %f%% (%d hits)\n", (hits*100.0)/n, hits); + // Print the clustering vector in csv format + //for (int i = 0; i < (int)(n-1); i++) + // printf("%d,",clustering_h[i]); + //printf("%d,\n",clustering_h[n-1]); + free(clustering_h); + printf("Done!\n"); + + return EXIT_SUCCESS; +} + diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu b/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu new file mode 100644 index 00000000000..d39551d768f --- /dev/null +++ b/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +// Turn on to see stats for each level +//#define ENABLE_LOG true +#include "nvlouvain.cuh" + + + +/* Louvain Clustering Sample + + +Social network example: Zachary Karate Club +W. 
Zachary, “An information flow model for conflict and fission in small groups,” Journal of Anthropological Research, vol. 33, pp. 452–473, 1977 +https://en.wikipedia.org/wiki/Zachary's_karate_club +-------------------------------------------------------------------- +V = 34 +E = 78 bidirectional, 156 directed edges + +Bidirectional edges list: +[2 1] [3 1] [3 2] [4 1] [4 2] [4 3] [5 1] [6 1] [7 1] [7 5] [7 6] [8 1] [8 2] [8 3] [8 4] [9 1] [9 3] [10 3] [11 1] [11 5] [11 6] [12 1] [13 1] [13 4] [14 1] [14 2] [14 3] [14 4] [17 6] [17 7] +[18 1] [18 2] [20 1] [20 2] [22 1] [22 2] [26 24] [26 25] [28 3] [28 24] [28 25] [29 3] [30 24] [30 27] [31 2] [31 9] [32 1] [32 25] [32 26] [32 29] [33 3] [33 9] [33 15] [33 16] +[33 19] [33 21] [33 23] [33 24] [33 30] [33 31] [33 32] [34 9] [34 10] [34 14] [34 15] [34 16] [34 19] [34 20] [34 21] [34 23] [34 24] [34 27] [34 28] [34 29] [34 30] [34 31] +[34 32] [34 33] + +CSR representation (directed): +csrRowPtrA_h {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156} +csrColIndA_h {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, 10, 16, 0, +4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, +24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, +26, 27, 28, 29, 30, 31, 32} +csrValA_h {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, +1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0} + +-------------------------------------------------------------------- + +Operation: Louvain Clustering default parameters in modularity maximization + +-------------------------------------------------------------------- + +Expected output: +This sample prints the modlarity score + +*/ + +using namespace nvlouvain; + +void check_status(nvlouvainStatus_t status) +{ + if ((int)status != 0) + { + printf("ERROR : %s\n",nvlouvainStatusGetString(status)); + exit(0); + } +} + +int main(int argc, char **argv) +{ + // Hard-coded Zachary Karate Club network input + int csrRowPtrA_input [] = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, + 139, 156}; + int csrColIndA_input [] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, + 6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 
32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33, + 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, + 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; + float csrValA_input [] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; +// int ref_clustering [] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int *csrRowPtrA_h = &csrRowPtrA_input[0]; + int *csrColIndA_h = &csrColIndA_input[0]; + float *csrValA_h = &csrValA_input[0]; + + // Variables + const size_t n = 34, nnz = 156; + bool weighted = false; + bool has_init_cluster = false; + int num_levels = 0; + int *init_cluster_ptr = nullptr; + float final_modulartiy = 0; + std::vector< std::vector > best_cluster_vec; + + //Solve clustering with modularity maximization algorithm + check_status(louvain(csrRowPtrA_h, csrColIndA_h, csrValA_h, n, nnz, weighted, has_init_cluster, init_cluster_ptr, final_modulartiy, best_cluster_vec, num_levels)); + + //Print quality (modualrity) + printf("Modularity_score: %f\n", final_modulartiy); + printf("num levels: %d\n", num_levels); + printf("Done!\n"); + + //for (size_t i = 0; i < best_cluster_vec.size(); i++) + //{ + // for(std::vector::iterator it = best_cluster_vec[i].begin(); it != best_cluster_vec[i].end(); ++it) + // std::cout << *it <<' '; + // std::cout << std::endl; + //} + + return EXIT_SUCCESS; +} diff --git a/cpp/nvgraph/cpp/include/arnoldi.hxx b/cpp/nvgraph/cpp/include/arnoldi.hxx new file mode 100644 index 00000000000..9b5163fc294 --- /dev/null +++ b/cpp/nvgraph/cpp/include/arnoldi.hxx @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace nvgraph +{ + +template +class ImplicitArnoldi +{ +public: + typedef IndexType_ IndexType; + typedef ValueType_ ValueType; + +private: + //Arnoldi + ValuedCsrGraph m_A ;//device + std::vector m_Vi; // Host vector of device adresses -> no it is a 2D vect + Vector m_V; // Each colum is a vector of size n, colum major storage + Vector m_Q_d; // Device version of Q (Qt) + Vector m_V_tmp; // Output of V*Q <=> QtVt + Vector m_ritz_eigenvectors_d; + Vector m_eigenvectors; + std::vector m_H; //host + std::vector m_H_select; //host + std::vector m_H_tmp; //host (lapack likes to overwrite input) + std::vector m_ritz_eigenvalues; //host + std::vector m_ritz_eigenvalues_i; //host + std::vector m_shifts; //host + std::vector m_ritz_eigenvectors;//host + std::vector m_Q; //host + std::vector m_Q_tmp; //host (lapack likes to overwrite input) + std::vector m_mns_residuals; //host resuals of subspaces + std::vector m_mns_beta; //host resuals of subspaces + + Vector m_a; // Markov + Vector m_b; // Markov + Vector m_D; // Laplacian + + ValueType m_beta; // from arnoldi projection algorithm + ValueType m_residual; // is set by compute_residual() + ValueType m_damping; // for Markov and Pagerank + + float m_tolerance; + + int m_nr_eigenvalues; // the number of wanted eigenvals, also called k in the litterature + int m_n_eigenvalues; // the number of eigenvals we keep in the solver, this greater or equal to k, this can be m_nr_eigenvalues or m_nr_eigenvalues+1 + int m_krylov_size; // the maximum size of the krylov sobspace, also called m in the litterature (m=k+p) + int m_iterations; // a counter of restart, each restart cost m_krylov_size-m_n_eigenvalues arnoldi iterations (~spmv) + int m_max_iter; // maximum number of iterations + + int m_parts; // laplacian related + + //miramns related ints + int m_nested_subspaces; // the number of subspace to evaluate in MIRAMns + int m_nested_subspaces_freq; // the frequence at which we should evaluate subspaces in MIRAMns + int m_select; // best subspace size + int m_select_idx; // best subspace number (0 indexed) + int m_safety_lower_bound; // The smallest subspace to check is m_safety_lower_bound+m_nr_eigenvalues+1 + + bool m_converged; + bool m_is_setup; + bool m_has_guess; + bool m_markov; + bool m_miramns; + bool m_dirty_bit; // to know if H has changed, so if we need to call geev + bool m_laplacian; + bool has_init_guess; + + // Warning : here an iteration is a restart + bool solve_it(); + + // Input: A V[0] + // Output: V, H, f(=V[m_krylov_size]) + bool solve_arnoldi(int lower_bound, int upper_bound); + + // Input: H - a real square upper Hessenberg matrix + // Output: w - eigenvalues of H sorted according to which + // most wanted to least wanted order + // Optionally compute the eigenvalues of H + void select_shifts(bool dirty_bit=false); + + // reorder eigenpairs by largest real part + void LR(int subspace_sz); + + // reorder eigenpairs by largest magnitude + void LM(int subspace_sz); + + // reorder eigenpairs by smallest real part + void SR(int subspace_sz); + + // Input: Q -- a real square orthogonal matrix + // H -- a real square upper Hessenberg matrix + // mu -- a real shift + // Output: Q+ -- a real orthogonal matrix + // H+ -- a real square upper Hessenberg matrix + // This step will "refine" the subspace by "pushing" the information + // into the top left corner + void qr_step(); + + // Update V and f using Q+ and H+ + void refine_basis(); + + // Approximate residual of the largest Ritz pair of H + // 
Optionally compute the eigenvalues of H + void compute_residual(int subspace_size, bool dirty_bit=false); + + void compute_eigenvectors(); + + void select_subspace(); + + // extract H_select from H + void extract_subspace(int m); + + // clean everything outside of the new_sz*new_sz hessenberg matrix (in colum major) + void cleanup_subspace(std::vector& v, int ld, int new_sz); + + // clean everything outside of the new_sz*new_sz hessenberg matrix (in colum major) + void shift(std::vector& H, int ld, int m, ValueType mu); + +public: + // Simple constructor + ImplicitArnoldi(void) {}; + // Simple destructor + ~ImplicitArnoldi(void) {}; + + // Create a ImplicitArnoldi Solver + ImplicitArnoldi(const ValuedCsrGraph & A); + + // Create a ImplicitArnoldi Solver with support of graph laplacian generation + ImplicitArnoldi(const ValuedCsrGraph & A, int parts); + + // Create a ImplicitArnoldi Solver with support of damping factor and rank one updates (pagerank, markov ...) + ImplicitArnoldi(const ValuedCsrGraph & A, Vector& dangling_nodes, const float tolerance, const int max_iter, ValueType alpha=0.95); + + void setup( Vector& initial_guess, const int restart_it, const int nEigVals); // public because we want to use and test that directly and/or separately + + // Starting from V, H, f : + // Call the QRstep, project the update, launch the arnlodi with the new base + // and check the quality of the new result + void implicit_restart(); // public because we want to use and test that directly and/or separately + + // The total number of SPMV will be : m_krylov_size + (m_krylov_size-m_n_eigenvalues)*nb_restart + NVGRAPH_ERROR solve(const int restart_it, const int nEigVals, + Vector& initial_guess, + Vector& eigVals, + Vector& eigVecs, + const int n_sub_space=0); + + inline ValueType get_residual() const {return m_residual;} + inline int get_iterations() const {return m_iterations;} + + // we use that for tests, unoptimized copies/transfers inside + std::vector get_H_copy() {return m_H;} + std::vector get_Hs_copy() {return m_H_select;} + std::vector get_ritz_eval_copy(){return m_ritz_eigenvalues;} // should be called after select_shifts + std::vector get_V_copy(); + std::vector get_f_copy(); + std::vector get_fp_copy(); +}; + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/async_event.cuh b/cpp/nvgraph/cpp/include/async_event.cuh new file mode 100644 index 00000000000..1f4491645cc --- /dev/null +++ b/cpp/nvgraph/cpp/include/async_event.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + + +class AsyncEvent +{ + public: + AsyncEvent() : async_event(NULL) { } + AsyncEvent(int size) : async_event(NULL) { cudaEventCreate(&async_event); } + ~AsyncEvent() { if (async_event != NULL) cudaEventDestroy(async_event); } + + void create() { cudaEventCreate(&async_event); } + void record(cudaStream_t s = 0) + { + if (async_event == NULL) + { + cudaEventCreate(&async_event); // check if we haven't created the event yet + } + + cudaEventRecord(async_event, s); + } + void sync() + { + cudaEventSynchronize(async_event); + } + private: + cudaEvent_t async_event; +}; + diff --git a/cpp/nvgraph/cpp/include/async_event.hxx b/cpp/nvgraph/cpp/include/async_event.hxx new file mode 100644 index 00000000000..a3ad6567734 --- /dev/null +++ b/cpp/nvgraph/cpp/include/async_event.hxx @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace nvgraph { + + class AsyncEvent { + public: + AsyncEvent() : async_event(NULL) { } + AsyncEvent(int size) : async_event(NULL) { cudaEventCreate(&async_event); } + ~AsyncEvent() { if (async_event != NULL) cudaEventDestroy(async_event); } + + void create() { cudaEventCreate(&async_event); } + void record(cudaStream_t s=0) { + if (async_event == NULL) + cudaEventCreate(&async_event); // check if we haven't created the event yet + cudaEventRecord(async_event,s); + } + void sync() { + cudaEventSynchronize(async_event); + } + private: + cudaEvent_t async_event; + }; + +} + diff --git a/cpp/nvgraph/cpp/include/atomics.hxx b/cpp/nvgraph/cpp/include/atomics.hxx new file mode 100644 index 00000000000..4cd02764ed7 --- /dev/null +++ b/cpp/nvgraph/cpp/include/atomics.hxx @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +namespace nvgraph { +//This file contains the atomic operations for floats and doubles from cusparse/src/cusparse_atomics.h + +static __inline__ __device__ double atomicFPAdd(double *addr, double val) +{ +// atomicAdd for double starts with sm_60 +#if __CUDA_ARCH__ >= 600 + return atomicAdd( addr, val ); +#else + unsigned long long old = __double_as_longlong( addr[0] ), assumed; + + do + { + assumed = old; + old = atomicCAS( (unsigned long long *) addr, assumed, __double_as_longlong( val + __longlong_as_double( assumed ) ) ); + } + while ( assumed != old ); + + return old; +#endif +} + +// atomicAdd for float starts with sm_20 +static __inline__ __device__ float atomicFPAdd(float *addr, float val) +{ + return atomicAdd( addr, val ); +} + +static __inline__ __device__ double atomicFPMin(double *addr, double val) +{ + double old, assumed; + old=*addr; + do{ + assumed = old; + old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(min(val,assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMin(float *addr, float val) +{ + float old, assumed; + old=*addr; + do{ + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int(min(val,assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPMax(double *addr, double val) +{ + double old, assumed; + old=*addr; + do{ + assumed = old; + old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(max(val,assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMax(float *addr, float val) +{ + float old, assumed; + old=*addr; + do{ + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int(max(val,assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPOr(double *addr, double val) +{ + double old, assumed; + old=*addr; + do{ + assumed = old; + old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong((bool)val | (bool)assumed))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPOr(float *addr, float val) +{ + float old, assumed; + old=*addr; + do{ + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int((bool)val | (bool)assumed))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPLog(double *addr, double val) +{ + double old, assumed; + old=*addr; + do{ + assumed = old; + old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(-log(exp(-val)+exp(-assumed))))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ 
float atomicFPLog(float *addr, float val) +{ + float old, assumed; + old=*addr; + do{ + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int(-logf(expf(-val)+expf(-assumed))))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +} //end anmespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/bfs.hxx b/cpp/nvgraph/cpp/include/bfs.hxx new file mode 100755 index 00000000000..8cd5f37a8c8 --- /dev/null +++ b/cpp/nvgraph/cpp/include/bfs.hxx @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#pragma once + + + +#include + + + +//Used in nvgraph.h + +#define TRAVERSAL_DEFAULT_ALPHA 15 + +#define TRAVERSAL_DEFAULT_BETA 18 + + + +#include "nvgraph_error.hxx" + + + +namespace nvgraph + +{ + + template + + class Bfs + + { + + private: + + IndexType n, nnz; + + IndexType* row_offsets; + + IndexType* col_indices; + + + + bool directed; + bool deterministic; + + + // edgemask, distances, predecessors are set/read by users - using Vectors + + bool useEdgeMask; + + bool computeDistances; + + bool computePredecessors; + + + + IndexType *distances; + + IndexType *predecessors; + + int *edge_mask; + + + + //Working data + + //For complete description of each, go to bfs.cu + + + + IndexType nisolated; + + IndexType *frontier, *new_frontier; + + IndexType * original_frontier; + + IndexType vertices_bmap_size; + + int *visited_bmap, *isolated_bmap; + + IndexType *vertex_degree; + + IndexType *buffer_np1_1, *buffer_np1_2; + + IndexType *frontier_vertex_degree; + + IndexType *exclusive_sum_frontier_vertex_degree; + + IndexType *unvisited_queue; + + IndexType *left_unvisited_queue; + + IndexType *exclusive_sum_frontier_vertex_buckets_offsets; + + + + IndexType *d_counters_pad; + + IndexType *d_new_frontier_cnt; + + IndexType *d_mu; + + IndexType *d_unvisited_cnt; + + IndexType *d_left_unvisited_cnt; + + + + void *d_cub_exclusive_sum_storage; + + size_t cub_exclusive_sum_storage_bytes; + + + + //Parameters for direction optimizing + + IndexType alpha, beta; + + + + cudaStream_t stream; + + //resets pointers defined by d_counters_pad (see implem) + + void resetDevicePointers(); + + NVGRAPH_ERROR setup(); + + void clean(); + + public: + + virtual ~Bfs(void) { + + clean(); + + }; + + + + Bfs(IndexType _n, IndexType _nnz, IndexType *_row_offsets, IndexType *_col_indices, bool _directed, IndexType _alpha, IndexType _beta, cudaStream_t _stream = 0) : n(_n), nnz(_nnz), row_offsets(_row_offsets), col_indices(_col_indices), directed(_directed), alpha(_alpha), beta(_beta), stream(_stream) { + + setup(); + + } + + + + NVGRAPH_ERROR configure(IndexType *distances, IndexType *predecessors, int *edge_mask); + + NVGRAPH_ERROR traverse(IndexType source_vertex); + + //Used only for benchmarks + + NVGRAPH_ERROR traverse(IndexType *source_vertices, IndexType nsources); + + }; + + + +} // end namespace nvgraph + + + diff --git a/cpp/nvgraph/cpp/include/bfs2d.hxx 
b/cpp/nvgraph/cpp/include/bfs2d.hxx new file mode 100644 index 00000000000..52cc9b2882d --- /dev/null +++ b/cpp/nvgraph/cpp/include/bfs2d.hxx @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +//Used in nvgraph.h +#define TRAVERSAL_DEFAULT_ALPHA 15 +#define TRAVERSAL_DEFAULT_BETA 18 + +#include "nvgraph_error.hxx" +#include "2d_partitioning.h" + +namespace nvgraph { + template + class Bfs2d { + private: + Matrix2d* M; + + bool directed; + bool deterministic; + GlobalType alpha; + GlobalType beta; + + // edgemask, distances, predecessors are set/read by users - using Vectors + bool useEdgeMask; + bool computeDistances; + bool computePredecessors; + int32_t vertices_bmap_size; + VertexData2D* distances; + VertexData2D* predecessors; + + //Working data + VertexData2D* frontier_bmap; + VertexData2D* visited_bmap; + VertexData2D_Unbuffered* frontier; + VertexData2D_Unbuffered* trim_frontier; + VertexData2D_Unbuffered* frontierSize; + VertexData2D_Unbuffered* degreeFlags; + std::vector frontierSize_h; + VertexData2D_Unbuffered* exSumDegree; + VertexData2D_Unbuffered* exSumStorage; + VertexData2D_Unbuffered* bucketOffsets; + std::vector frontierDegree_h; + + // Output locations + GlobalType* distances_out; + GlobalType* predecessors_out; + + NVGRAPH_ERROR setup(); + + void clean(); + + public: + virtual ~Bfs2d(void) { + clean(); + }; + + Bfs2d(Matrix2d* _M, + bool _directed, + GlobalType _alpha, + GlobalType _beta) : + M(_M), + directed(_directed), + alpha(_alpha), + beta(_beta){ + distances = NULL; + predecessors = NULL; + frontier_bmap = NULL; + visited_bmap = NULL; + setup(); + } + + NVGRAPH_ERROR configure(GlobalType *distances, GlobalType *predecessors); + + NVGRAPH_ERROR traverse(GlobalType source_vertex); + + //Used only for benchmarks + NVGRAPH_ERROR traverse(GlobalType *source_vertices, int32_t nsources); + }; +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/bfs2d_kernels.cuh b/cpp/nvgraph/cpp/include/bfs2d_kernels.cuh new file mode 100644 index 00000000000..792db1bd5e3 --- /dev/null +++ b/cpp/nvgraph/cpp/include/bfs2d_kernels.cuh @@ -0,0 +1,786 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include "nvgraph_error.hxx" + +#define MAXBLOCKS 65535 +#define WARP_SIZE 32 +#define INT_SIZE 32 +#define FILL_QUEUE_DIMX 256 +#define COMPUTE_BUCKET_OFFSETS_DIMX 512 +#define TOP_DOWN_EXPAND_DIMX 256 +#define TOP_DOWN_BUCKET_SIZE 32 +#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE) +#define TOP_DOWN_BATCH_SIZE 2 +#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) + +using namespace nvgraph; +namespace bfs_kernels { + + struct popCount : public thrust::unary_function { + __device__ + int operator()(int x) const + { + return __popc(x); + } + }; + + template + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + }; + + template<> + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + static const int max = INT_MAX; + }; + + template<> + struct vec_t { + typedef longlong4 vec4; + typedef longlong2 vec2; + static const long long int max = LLONG_MAX; + }; + + struct BitwiseOr { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (a | b); + } + }; + + struct predMerge { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if (a != -1 && b != -1) + return min(a, b); + if (a != -1) + return a; + if (b != -1) + return b; + return -1; + } + }; + + __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { + if (n == INT_SIZE) + return (~0); + int mask = (1 << n) - 1; + return mask; + } + + __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { + if (n == 0) + return 0; + int mask = ~((1 << (INT_SIZE - n)) - 1); + return mask; + } + + /** + * Finds the position of the next non-zero bit in the given value. The value is + * re-written with the found bit unset. + * @param val The integer to find the next non-zero bit in. + * @return The position of the next non-zero bit + */ + __forceinline__ __device__ int getNextNonZeroBit(int32_t& val) { + int ibit = __ffs(val) - 1; + val &= ~(1 << ibit); + + return ibit; + } + + template + __device__ IndexType binsearch_maxle(const IndexType *vec, + const IndexType val, + IndexType low, + IndexType high) { + while (true) { + if (low == high) + return low; //we know it exists + if ((low + 1) == high) + return (vec[high] <= val) ? 
high : low; + + IndexType mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + + } + } + + template + class degreeIterator: public std::iterator { + IndexType* offsets; + size_t pos; + public: + __host__ __device__ degreeIterator(IndexType* _offsets) : + offsets(_offsets), pos(0) { + } + __host__ __device__ degreeIterator(IndexType* _offsets, size_t _pos) : + offsets(_offsets), pos(_pos) { + } + __host__ __device__ IndexType operator[](int loc) { + return offsets[loc + 1] - offsets[loc]; + } + __host__ __device__ IndexType operator*() { + return offsets[pos + 1] - offsets[pos]; + } + __host__ __device__ degreeIterator operator+(int inc) { + degreeIterator it(offsets, pos + inc); + return it; + } + }; + + template + size_t getCubExclusiveSumStorageSize(IndexType n) { + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + IndexType *d_in = NULL, *d_out = NULL; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); + return temp_storage_bytes; + } + + template + size_t getCubSelectFlaggedStorageSize(IndexType n) { + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + IndexType *d_in = NULL, *d_out = NULL, *size_out = NULL; + degreeIterator degreeIt(NULL); + cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, degreeIt, d_out, size_out, n); + return temp_storage_bytes; + } + + /** + * Takes in the bitmap frontier and outputs the frontier as a queue of ids. + * @param bmap Pointer to the bitmap + * @param bmap_nints The number of ints used to store the bitmap + * @param n The number of bits in the bitmap + * @param outputQueue Pointer to the output queue + * @param output_cnt Pointer to counter for output size + */ + template + __global__ void convert_bitmap_to_queue_kernel(int32_t *bmap, + IndexType bmap_nints, + IndexType n, + IndexType *outputQueue, + IndexType *output_cnt) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + // When filling the output queue, we use output_cnt to know where to write in the queue + // (equivalent of int off = atomicAddd(unvisited_cnt, 1)) We will actually do only one + // atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common + // offset for the block in common_block_offset + __shared__ IndexType common_block_offset; + + // We don't want threads divergence in the loop (we're going to call __syncthreads) + // Using a block-only dependent in the condition of the loop + for (IndexType block_v_idx = blockIdx.x * blockDim.x; + block_v_idx < bmap_nints; + block_v_idx += blockDim.x * gridDim.x) { + + // Index of bmap that this thread will compute + IndexType v_idx = block_v_idx + threadIdx.x; + + int thread_int = (v_idx < bmap_nints) ? 
bmap[v_idx] : 0; + + // The last int can be only partially valid + // If we are indeed taking care of the last int in this thread, + // We need to first disable the inactive bits (vertices >= n) + if (v_idx == (bmap_nints - 1)) { + int active_bits = n - (INT_SIZE * v_idx); + int inactive_bits = INT_SIZE - active_bits; + int mask = getMaskNLeftmostBitSet(inactive_bits); + thread_int &= (~mask); + } + + //Counting number of set bits in this int + int n_in_int = __popc(thread_int); + int thread_offset; + + // We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue + // We ask for that space when computing the block scan, that will tell where to write those + // vertices in the queue, using the common offset of the block (see below) + BlockScan(scan_temp_storage).ExclusiveSum(n_in_int, thread_offset); + + // Last thread knows how many vertices will be written to the queue by this block + // Asking for that space in the queue using the global count, and saving the common offset + if (threadIdx.x == (FILL_QUEUE_DIMX - 1)) { + IndexType total = thread_offset + n_in_int; + common_block_offset = atomicAdd(output_cnt, total); + } + + // syncthreads for two reasons : + // - we need to broadcast common_block_offset + // - we will reuse scan_temp_storage (cf CUB doc) + __syncthreads(); + + IndexType current_index = common_block_offset + thread_offset; + int nvertices_to_write = n_in_int; + + // getNextNonZeroBit uses __ffs, which gives least significant bit set + // which means that as long as n_unvisited_in_int is valid, + // we will use valid bits + + while (nvertices_to_write > 0) { + if (nvertices_to_write >= 4 && (current_index % 4) == 0) { + typename vec_t::vec4 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextNonZeroBit(thread_int); + vec_v.y = v_idx * INT_SIZE + getNextNonZeroBit(thread_int); + vec_v.z = v_idx * INT_SIZE + getNextNonZeroBit(thread_int); + vec_v.w = v_idx * INT_SIZE + getNextNonZeroBit(thread_int); + + typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&outputQueue[current_index]); + *unvisited_i4 = vec_v; + + current_index += 4; + nvertices_to_write -= 4; + } + else if (nvertices_to_write >= 2 && (current_index % 2) == 0) { + typename vec_t::vec2 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextNonZeroBit(thread_int); + vec_v.y = v_idx * INT_SIZE + getNextNonZeroBit(thread_int); + + typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&outputQueue[current_index]); + *unvisited_i2 = vec_v; + + current_index += 2; + nvertices_to_write -= 2; + } else { + IndexType v = v_idx * INT_SIZE + getNextNonZeroBit(thread_int); + + outputQueue[current_index] = v; + + current_index += 1; + nvertices_to_write -= 1; + } + + } + } + } + + template + void convert_bitmap_to_queue(int32_t *bmap, + IndexType bmap_nints, + IndexType n, + IndexType *outputQueue, + IndexType *output_cnt, + cudaStream_t stream) { + dim3 grid, block; + block.x = FILL_QUEUE_DIMX; + grid.x = min((IndexType) MAXBLOCKS, (bmap_nints + block.x - 1) / block.x); + convert_bitmap_to_queue_kernel<<>>(bmap, + bmap_nints, + n, + outputQueue, + output_cnt); + cudaCheckError() + ; + } + + /** + * Kernel to compute bucket offsets for load balancing main top-down expand kernel + * @param frontier_degrees_exclusive_sum Exclusive sum of the local degrees of the frontier + * elements. + * @param bucket_offsets Output location for the bucket offsets. + * @param frontier_size Number of elements in the frontier. + * @param total_degree Total local degree of frontier elements. 
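+ *
+ * Illustration (hypothetical numbers, not taken from the original code): with
+ * frontier degrees {40, 10, 30} the exclusive sum is {0, 40, 50} and
+ * total_degree is 80. With TOP_DOWN_BUCKET_SIZE = 32 the bucket boundaries fall
+ * at edge ids 0, 32, 64, ..., so bucket_offsets begins {0, 0, 2, ...}: edges 0
+ * and 32 belong to frontier element 0 and edge 64 to frontier element 2. The
+ * expand kernel then only needs to binary search inside
+ * [bucket_offsets[b], bucket_offsets[b+1]] rather than over the whole frontier.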
+ */ + template + __global__ void compute_bucket_offsets_kernel(const IndexType *frontier_degrees_exclusive_sum, + IndexType *bucket_offsets, + const IndexType frontier_size, + IndexType total_degree) { + IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1); + + for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; + bid <= end; + bid += gridDim.x * blockDim.x) { + + IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); + + bucket_offsets[bid] = binsearch_maxle(frontier_degrees_exclusive_sum, + eid, + (IndexType) 0, + frontier_size - 1); + + } + } + + /** + * Wrapper function around compute_bucket_offsets_kernel. + * @param cumul Exclusive sum of the local degrees of the frontier elements. + * @param bucket_offsets Output location for the bucket offsets. + * @param frontier_size Number of elements in the frontier. + * @param total_degree Total local degree of frontier elements. + * @param m_stream Stream to use for execution. + */ + template + void compute_bucket_offsets(IndexType *cumul, + IndexType *bucket_offsets, + IndexType frontier_size, + IndexType total_degree, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COMPUTE_BUCKET_OFFSETS_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, + ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); + + compute_bucket_offsets_kernel<<>>(cumul, + bucket_offsets, + frontier_size, + total_degree); + cudaCheckError(); + } + + /** + * Kernel for setting the degree of each frontier element. + * @param frontier_degree Output to store frontier degrees. + * @param frontier The frontier elements. + * @param degreeIt Iterator providing the degree of a given vertex ID + * @param n The number of elements in the frontier. + */ + template + __global__ void set_frontier_degree_kernel(IndexType *frontier_degree, + IndexType *frontier, + InputIterator degreeIt, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType u = frontier[idx]; + frontier_degree[idx] = degreeIt[u]; + } + } + + /** + * Wrapper function for calling set_frontier_degree_kernel + * @param frontier_degree Output to store frontier degrees. + * @param frontier The frontier elements. + * @param degreeIt Iterator providing the degree of a given vertex ID. + * @param n The number of elements in the frontier. + * @param m_stream The stream to use for the kernel call. + */ + template + void set_frontier_degree(IndexType *frontier_degree, + IndexType *frontier, + InputIterator degreeIt, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + set_frontier_degree_kernel<<>>(frontier_degree, + frontier, + degreeIt, + n); + cudaCheckError(); + } + + /** + * Kernel for setting the degree of each frontier element. + * @param frontier_degree Output to store frontier degrees. + * @param frontier The frontier elements. + * @param degreeIt Iterator providing the degree of a given vertex ID + * @param n The number of elements in the frontier. + */ + template + __global__ void set_degree_flags_kernel(int8_t *degree_flags, + IndexType *frontier, + InputIterator degreeIt, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType u = frontier[idx]; + degree_flags[idx] = (degreeIt[u] == 0) ? 
0 : 1; + } + } + + /** + * Wrapper function for calling set_frontier_degree_kernel + * @param frontier_degree Output to store frontier degrees. + * @param frontier The frontier elements. + * @param degreeIt Iterator providing the degree of a given vertex ID. + * @param n The number of elements in the frontier. + * @param m_stream The stream to use for the kernel call. + */ + template + void set_degree_flags(int8_t *degree_flags, + IndexType *frontier, + InputIterator degreeIt, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + set_degree_flags_kernel<<>>(degree_flags, + frontier, + degreeIt, + n); + cudaCheckError(); + } + + /** + * Kernel for globalizing an array of ids using a given offset. Values of -1 remain + * unchanged, other values are incremented by the offset. + * @param ids The array of ids to globalize (input and output) + * @param offset The offset to be applied to each id. + * @param n The number of ids in the array. + */ + template + __global__ void globalize_ids_kernel(IndexType *ids, + IndexType offset, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType id = ids[idx]; + ids[idx] = (id == -1) ? -1 : id + offset; + } + } + + /** + * Wrapper function for calling globalize_ids_kernel + * @param ids The array of ids to globalize (input and output) + * @param offset The offset to be applied to each id. + * @param n The number of ids in the array. + * @param m_stream The stream to use for the kernel call. + */ + template + void globalize_ids(IndexType *ids, + IndexType offset, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + globalize_ids_kernel<<>>(ids, offset, n); + cudaCheckError(); + } + + template + __global__ void topdown_expand_kernel( const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType max_items_per_thread, + const IndexType lvl, + int *frontier_bmap, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *visited_bmap, + IndexType *distances, + GlobalType *predecessors) { + __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; + __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; + + IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; + IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) + / TOP_DOWN_EXPAND_DIMX; + +// if (threadIdx.x == 0) +// printf("n_items_per_thread_left=%d max_items_per_thread=%d\n", n_items_per_thread_left, max_items_per_thread); + n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); + + for (; + (n_items_per_thread_left > 0) && (block_offset < totaldegree); + block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, + n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { + + // In this loop, we will process batch_set_size batches + IndexType nitems_per_thread = min(n_items_per_thread_left, + (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); + + // Loading buckets offset (see compute_bucket_offsets_kernel) + + if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) + 
shared_buckets_offsets[threadIdx.x] = + frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + + threadIdx.x]; + + // We will use shared_buckets_offsets + __syncthreads(); + + // + // shared_buckets_offsets gives us a range of the possible indexes + // for edge of linear_threadx, we are looking for the value k such as + // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx + // + // we have 0 <= k < frontier_size + // but we also have : + // + // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] + // <= k + // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] + // + // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) + // We will load them here + // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop + // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) + + //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ + //If it doesn't fit, --right until it does, then loop + //It is excepted to fit on the first try, that's why we start right = nitems_per_thread + + IndexType left = 0; + IndexType right = nitems_per_thread; + + while (left < nitems_per_thread) { + // + // Values that are necessary to compute the local binary searches + // We only need those with indexes between extremes indexes of buckets_offsets + // We need the next val for the binary search, hence the +1 + // + + IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + + //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 + while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { + --right; + + nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + } + + IndexType nitems_per_thread_for_this_load = right - left; + + IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left + * NBUCKETS_PER_BLOCK]; + + //TODO put again the nvalues_to_load == 1 + if (threadIdx.x < nvalues_to_load) { + shared_frontier_degrees_exclusive_sum[threadIdx.x] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + threadIdx.x]; + } + + if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { + shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + TOP_DOWN_EXPAND_DIMX]; + } + + //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync + //TODO we don't use it if nvalues_to_load == 1 + __syncthreads(); + + // Now we will process the edges + // Here each thread will process nitems_per_thread_for_this_load + for (IndexType item_index = 0; + item_index < nitems_per_thread_for_this_load; + item_index += TOP_DOWN_BATCH_SIZE) { + + // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction parallism) + // Reduces latency + + IndexType current_max_edge_index = min(block_offset + + (left + + nitems_per_thread_for_this_load) + * blockDim.x, + totaldegree); + + /** + * We will need vec_u (source of the edge) until the end if we need to save the + * predecessors. 
For others informations, we will reuse pointers on the go + * (nvcc does not color well the registers in that case) + */ + IndexType vec_u[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; + + IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; + +#pragma unroll + for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + + IndexType ibatch = left + item_index + iv; + IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; + + if (gid < current_max_edge_index) { + IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) + / TOP_DOWN_BUCKET_SIZE; + IndexType bucket_start = shared_buckets_offsets[start_off_idx] + - frontier_degrees_exclusive_sum_block_offset; + IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] + - frontier_degrees_exclusive_sum_block_offset; + + IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, + gid, + bucket_start, + bucket_end) + + frontier_degrees_exclusive_sum_block_offset; + vec_u[iv] = frontier[k]; // origin of this edge + vec_frontier_degrees_exclusive_sum_index[iv] = + frontier_degrees_exclusive_sum[k]; + } else { + vec_u[iv] = -1; + vec_frontier_degrees_exclusive_sum_index[iv] = -1; + } + + } + + IndexType *vec_row_ptr_u = &local_buf1[0]; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType u = vec_u[iv]; + //row_ptr for this vertex origin u + vec_row_ptr_u[iv] = (u != -1) ? row_ptr[u] : -1; + } + + //We won't need row_ptr after that, reusing pointer + IndexType *vec_dest_v = vec_row_ptr_u; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType thread_item_index = left + item_index + iv; + IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; + + IndexType row_ptr_u = vec_row_ptr_u[iv]; + IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; + + //Destination of this edge + vec_dest_v[iv] = (row_ptr_u != -1) ? col_ind[edge] : -1; +// if (vec_u[iv] != -1 && vec_dest_v[iv] != -1) +// printf("Edge to examine: %d, %d\n", vec_u[iv],vec_dest_v[iv]); + } + + //We don't need vec_frontier_degrees_exclusive_sum_index anymore + IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_dest_v[iv]; + vec_v_visited_bmap[iv] = (v != -1) ? visited_bmap[v / INT_SIZE] : (~0); //will look visited + } + + // From now on we will consider v as a frontier candidate + // If for some reason vec_candidate[iv] should be put in the new_frontier + // Then set vec_candidate[iv] = -1 + IndexType *vec_frontier_candidate = vec_dest_v; + +#pragma unroll + + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + + int is_visited = vec_v_visited_bmap[iv] & m; + + if (is_visited) + vec_frontier_candidate[iv] = -1; + } + +#pragma unroll + /** + * Here is where the distances, predecessors, new bitmap frontier and visited bitmap + * get written out. 
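+ * atomicOr returns the previous word of the bitmap, so the check !(m & q)
+ * below is true only for the single thread that actually flipped the visited
+ * bit for v; that thread alone writes distances[v] = lvl and, if requested,
+ * the predecessor vec_u[iv]. Every surviving candidate is also OR-ed into
+ * frontier_bmap so it becomes part of the next frontier.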
+ */ + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + if (v != -1) { + int m = 1 << (v % INT_SIZE); + int q = atomicOr(&visited_bmap[v / INT_SIZE], m); //atomicOr returns old + int f = atomicOr(&frontier_bmap[v / INT_SIZE], m); + if (!(m & q)) { //if this thread was the first to discover this node + if (distances) + distances[v] = lvl; + + if (predecessors) { + IndexType pred = vec_u[iv]; + predecessors[v] = pred; + } + } + } + } + + //We need naccepted_vertices to be ready + __syncthreads(); + } + + //We need to keep shared_frontier_degrees_exclusive_sum coherent + __syncthreads(); + + //Preparing for next load + left = right; + right = nitems_per_thread; + } + + //we need to keep shared_buckets_offsets coherent + __syncthreads(); + } + } + + template + void frontier_expand(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType lvl, + IndexType *frontier_bmap, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *visited_bmap, + IndexType *distances, + GlobalType *predecessors, + cudaStream_t m_stream) { + if (!totaldegree) + return; + + dim3 block; + block.x = TOP_DOWN_EXPAND_DIMX; + + IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) + / (MAXBLOCKS * block.x); + + dim3 grid; + grid.x = min((totaldegree + max_items_per_thread * block.x - 1) + / (max_items_per_thread * block.x), + (IndexType) MAXBLOCKS); + + topdown_expand_kernel<<>>( row_ptr, + col_ind, + frontier, + frontier_size, + totaldegree, + max_items_per_thread, + lvl, + frontier_bmap, + frontier_degrees_exclusive_sum, + frontier_degrees_exclusive_sum_buckets_offsets, + visited_bmap, + distances, + predecessors); + cudaCheckError(); + } +} diff --git a/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx b/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx new file mode 100644 index 00000000000..2143ec8e4ac --- /dev/null +++ b/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + + +// + +#if __cplusplus > 199711L +#include +#define SHARED_PREFIX std + +#else +#include +#define SHARED_PREFIX boost + +#endif + +#include +#include "nvgraph_error.hxx" + +namespace nvgraph +{ + +template< typename T > +class DeviceDeleter +{ + cudaStream_t mStream; +public: + DeviceDeleter(cudaStream_t stream) : mStream(stream) {} + void operator()(T *ptr) + { + cnmemStatus_t status = cnmemFree(ptr, mStream); + if( status != CNMEM_STATUS_SUCCESS ) + { + FatalError("Memory manager internal error (free)", NVGRAPH_ERR_UNKNOWN); + } + } +}; + + +template< typename T > +inline SHARED_PREFIX::shared_ptr allocateDevice(size_t n, cudaStream_t stream) +{ + T *ptr = NULL; + cnmemStatus_t status = cnmemMalloc((void**) &ptr, n*sizeof(T), stream); + if( status == CNMEM_STATUS_OUT_OF_MEMORY) + { + FatalError("Not enough memory", NVGRAPH_ERR_NO_MEMORY); + } + else if (status != CNMEM_STATUS_SUCCESS) + { + FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); + } + return SHARED_PREFIX::shared_ptr(ptr, DeviceDeleter(stream)); +} + +template< typename T > +class DeviceReleaser +{ + cudaStream_t mStream; +public: + DeviceReleaser(cudaStream_t stream) : mStream(stream) {} + void operator()(T *ptr) + { + + } +}; + +template< typename T > +inline SHARED_PREFIX::shared_ptr attachDevicePtr(T * ptr_in, cudaStream_t stream) +{ + T *ptr = ptr_in; + return SHARED_PREFIX::shared_ptr(ptr, DeviceReleaser(stream)); +} + + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/common_selector.cuh b/cpp/nvgraph/cpp/include/common_selector.cuh new file mode 100644 index 00000000000..7a47d5f1300 --- /dev/null +++ b/cpp/nvgraph/cpp/include/common_selector.cuh @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +//#pragma once + +namespace nvlouvain{ + +template __inline__ __device__ T_ELEM __cachingLoad(const T_ELEM *addr) { +#if __CUDA_ARCH__ < 350 + return *addr; +#else + return __ldg(addr); +#endif +} +__device__ +inline float random_weight(int i, int j, int n) +{ +#define RAND_MULTIPLIER 1145637293 + int i_min = (min(i, j) * RAND_MULTIPLIER) % n; + int i_max = (max(i, j) * RAND_MULTIPLIER) % n; + return ((float)i_max / n) * i_min; +} + +/* WARNING: notice that based on the hexadecimal number in the last line + in the hash function the resulting floating point value is very likely + on the order of 0.5. */ +__host__ __device__ inline unsigned int hash_val(unsigned int a, unsigned int seed) +{ + a ^= seed; + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) + (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a ^ 0xd3a2646c) + (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) + (a >> 16); + return a; +} + +/* return 1e-5 for float [sizeof(float)=4] and 1e-12 for double [sizeof(double)=8] types */ +template +__host__ __device__ WeightType scaling_factor(){ + return (sizeof(WeightType) == 4) ? 
1e-5f : 1e-12; +} + +// Kernel to compute the weight of the edges +// original version from AmgX. +template +__global__ +void computeEdgeWeightsBlockDiaCsr_V2( const IndexType* row_offsets, const IndexType *row_indices, const IndexType *column_indices, + const IndexType *dia_values, const ValueType* nonzero_values, const IndexType num_nonzero_blocks, + WeightType *str_edge_weights, WeightType *rand_edge_weights, int num_owned, int bsize, int component, int weight_formula) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int i,j,kmin,kmax; + int bsize_sq = bsize*bsize; + WeightType den; + + int matrix_weight_entry = component*bsize+component; + + while (tid < num_nonzero_blocks) + { + i = row_indices[tid]; + j = column_indices[tid]; + + if ((i != j) && (j < num_owned)) // skip diagonal and across-boundary edges + { + den = (WeightType) max(fabs(__cachingLoad(&nonzero_values[dia_values[i]*bsize_sq+matrix_weight_entry])),fabs(__cachingLoad(&nonzero_values[dia_values[j]*bsize_sq+matrix_weight_entry]))); + + kmin = __cachingLoad(&row_offsets[j]); //kmin = row_offsets[j]; + kmax = __cachingLoad(&row_offsets[j+1]); //kmax = row_offsets[j+1]; + + WeightType kvalue = 0.0; + bool foundk = false; + for (int k=kmin;k()*hash_val(min(i,j),max(i,j))/UINT_MAX; + ed_weight += small_fraction*ed_weight; + str_edge_weights[tid] = ed_weight; + + // fill up random unique weights + if( rand_edge_weights != NULL ) + rand_edge_weights[tid] = random_weight(i, j, num_owned); + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel to compute the weight of the edges +// simple version modified for nvgraph +template +__global__ +void computeEdgeWeights_simple( const IndexType* row_offsets, const IndexType *row_indices, const IndexType *column_indices, + const ValueType *row_sum, const ValueType* nonzero_values, const IndexType num_nonzero_blocks, + WeightType *str_edge_weights, WeightType *rand_edge_weights, int n, int weight_formula) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int i,j,kmin,kmax; + WeightType den; + + while (tid < num_nonzero_blocks) + { + i = row_indices[tid]; + j = column_indices[tid]; + + if ((i != j) && (j < n)) // skip diagonal and across-boundary edges + { + den = (WeightType) max(fabs(__cachingLoad(&row_sum[i])),fabs(__cachingLoad(&row_sum[j]))); + + kmin = __cachingLoad(&row_offsets[j]); //kmin = row_offsets[j]; + kmax = __cachingLoad(&row_offsets[j+1]); //kmax = row_offsets[j+1]; + + WeightType kvalue = 0.0; + bool foundk = false; + for (int k=kmin;k()*hash_val(min(i,j),max(i,j))/UINT_MAX; + ed_weight += small_fraction*ed_weight; + str_edge_weights[tid] = ed_weight; + + // fill up random unique weights + if( rand_edge_weights != NULL ) + rand_edge_weights[tid] = random_weight(i, j, n); + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel to compute the weight of the edges using geometry distance between edges +template +__global__ +void computeEdgeWeightsDistance3d( const int* row_offsets, const IndexType *column_indices, + const ValueType* gx, const ValueType* gy, const ValueType* gz, float *str_edge_weights, int num_rows) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + float lx, ly, lz; + float px, py, pz; + int kmin, kmax; + int col_id; + + while (tid < num_rows) + { + lx = gx[tid]; + ly = gy[tid]; + lz = gz[tid]; + kmin = row_offsets[tid]; + kmax = row_offsets[tid+1]; + + for (int k=kmin;k +__global__ +void matchEdges(const IndexType num_rows, IndexType *partner_index, IndexType *aggregates, const IndexType *strongest_neighbour) +{ + int potential_match, 
potential_match_neighbour; + + for (int tid= threadIdx.x + blockDim.x*blockIdx.x; tid < num_rows; tid += gridDim.x*blockDim.x) + { + if (partner_index[tid] == -1) // Unaggregated row + { + potential_match = strongest_neighbour[tid]; + if (potential_match!=-1) + { + potential_match_neighbour = strongest_neighbour[potential_match]; + + if ( potential_match_neighbour == tid ) // we have a match + { + partner_index[tid] = potential_match; + aggregates[tid] = ( potential_match > tid) ? tid : potential_match; + } + } + } + } +} + +template +__global__ +void joinExistingAggregates(IndexType num_rows, IndexType *aggregates, IndexType *aggregated, const IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + while (tid < num_rows) + { + if (aggregated[tid] == -1 && aggregates_candidate[tid] != -1) // Unaggregated row + { + aggregates[tid] = aggregates_candidate[tid]; + aggregated[tid] = 1; + } + + tid += gridDim.x*blockDim.x; + } +} + + +template +__global__ +void aggregateSingletons( IndexType* aggregates, IndexType numRows ) +{ + int tid = threadIdx.x + blockDim.x*blockIdx.x; + + while( tid < numRows ) + { + if( aggregates[tid] == -1 ) //still unaggregated! + aggregates[tid] = tid; //then become a singleton + + tid += gridDim.x*blockDim.x; + } +} + +__device__ +inline float random_weight2(int i, int j) +{ +#define RAND_MULTIPLIER 1145637293 + unsigned long i_min = (min(i, j) * RAND_MULTIPLIER); + unsigned long i_max = (max(i, j) * RAND_MULTIPLIER); + return ((float)i_min / i_max); +} + + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void findStrongestNeighbourBlockDiaCsr_V2(const IndexType *row_offsets, const IndexType *column_indices, + const float *edge_weights, IndexType n, IndexType *aggregates, + IndexType *strongest_neighbour_1phase, IndexType *strongest_neighbour, + const size_t bsize, int phase, bool merge_singletons) +{ + int tid = threadIdx.x + blockDim.x*blockIdx.x; + + float weight; + int jcol; + + while (tid < n) + { + int strongest_unaggregated = -1; + int strongest_aggregated = -1; + float max_weight_unaggregated = 0.; + float max_weight_aggregated = 0.; + if (aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; // skip diagonal and halo + if (phase == 2 && strongest_neighbour_1phase[jcol] != tid) continue; // if 2nd phase only accept those who gave a hand on the 1st phase + + // Identify strongest aggregated and unaggregated neighbours + if (aggregates[jcol] == -1 && (weight > max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + // find the smallestt index with weight = max_weight + } + else if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // aggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } +// printf("-- phase: %d tid: %d strongest_neighbour: %d %f\n", phase, tid, strongest_neighbour[tid], max_weight_unaggregated); + + if (strongest_unaggregated == -1 && strongest_aggregated != -1) // All neighbours are aggregated + { + if( merge_singletons ){ + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + } + else{ + aggregates[tid] = tid; + } + } + else if (strongest_unaggregated != -1) { + + if (phase == 2) { + float rand_w1 = 
random_weight2(tid, strongest_neighbour_1phase[tid]); + strongest_neighbour[tid] = max_weight_unaggregated > rand_w1 ? strongest_unaggregated : strongest_neighbour_1phase[tid]; + } + else strongest_neighbour_1phase[tid] = strongest_unaggregated; + + //strongest_neighbour_1phase[tid] = strongest_unaggregated; + } + + else { + if (phase == 2) strongest_neighbour[tid] = strongest_neighbour_1phase[tid]; + else strongest_neighbour_1phase[tid] = tid; + } + } +/* + if(tid<16) + printf("++ phase: %d tid: %d strongest_neighbour: %d %f\n", phase, tid, strongest_neighbour[tid], max_weight_unaggregated); + */ + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that checks if perfect matchs exist +template +__global__ +void matchEdges(const IndexType num_rows, IndexType *aggregates, const int *strongest_neighbour) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int potential_match, potential_match_neighbour; + + while (tid < num_rows) + { + if (aggregates[tid] == -1) // Unaggregated row + { + potential_match = strongest_neighbour[tid]; + potential_match_neighbour = strongest_neighbour[potential_match]; + + if (potential_match != -1 && potential_match_neighbour == tid) // we have a match + aggregates[tid] = ( potential_match > tid ) ? tid : potential_match; + /* + if (potential_match != -1){ + potential_match_neighbour = strongest_neighbour[potential_match]; + + if (potential_match_neighbour == tid) // we have a match + aggregates[tid] = ( potential_match > tid ) ? tid : potential_match; + } + */ + } + tid += gridDim.x*blockDim.x; + } +} + +template +__global__ +void countAggregates(const IndexType num_rows, const IndexType *aggregates, int *num_unaggregated) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + int c = 0; + int i = tid; + while( i < num_rows ) { + c += ( aggregates[i] == -1 ); + i += gridDim.x * blockDim.x; + } + __shared__ volatile int smem[block_size]; + smem[threadIdx.x] = c; + __syncthreads(); + + for( int off = blockDim.x / 2; off >= 32; off = off / 2 ) { + if( threadIdx.x < off ) + smem[threadIdx.x] += smem[threadIdx.x + off]; + __syncthreads(); + } + + // warp reduce + if( threadIdx.x < 32 ) { + smem[threadIdx.x] += smem[threadIdx.x+16]; + smem[threadIdx.x] += smem[threadIdx.x+8]; + smem[threadIdx.x] += smem[threadIdx.x+4]; + smem[threadIdx.x] += smem[threadIdx.x+2]; + smem[threadIdx.x] += smem[threadIdx.x+1]; + } + + if( threadIdx.x == 0 ) + atomicAdd(num_unaggregated, smem[0]); +} + + +template +__global__ +void joinExistingAggregates(IndexType num_rows, IndexType *aggregates, const IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + while (tid < num_rows) + { + if (aggregates[tid] == -1 && aggregates_candidate[tid] != -1) // Unaggregated row + aggregates[tid] = aggregates_candidate[tid]; + + tid+=gridDim.x*blockDim.x; + } +} + + + +// Kernel that merges unaggregated vertices its strongest aggregated neighbour +// Weights are read from edge_weights array +// For block_dia_csr_matrix_format +template +__global__ +void mergeWithExistingAggregatesBlockDiaCsr_V2(const IndexType *row_offsets, const IndexType *column_indices, const float *edge_weights, + const int n, IndexType *aggregates, int bsize, const int deterministic, IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int jcol; + float weight; + + + while (tid < n) + { + float max_weight_aggregated = 0.; + int strongest_aggregated = -1; + if (aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; // skip 
diagonal + + // Identify strongest aggregated neighbour + if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + + if (strongest_aggregated != -1) // Found a neighbour to aggregate to + { + if (deterministic) { + aggregates_candidate[tid] = aggregates[strongest_aggregated]; + } + else { + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + } + } + else // All neighbours are unaggregated, leave alone + { + if (deterministic) + aggregates_candidate[tid] = tid; + else + aggregates[tid] = tid; + } + + + } + tid += gridDim.x*blockDim.x; + } +} + + + +template +__global__ void computeDiagonalKernelCSR(INDEX_TYPE num_rows, const INDEX_TYPE *row_offsets, const INDEX_TYPE *col_indices, INDEX_TYPE *diag) { + + INDEX_TYPE row=(blockIdx.x*blockDim.x+threadIdx.x); + + while(row +__global__ void convert_type(int n, const T1 *src, T2 *dest) { + + int tid=(blockIdx.x*blockDim.x+threadIdx.x); + while(tid(src[tid]); + tid += gridDim.x*blockDim.x; + } +} +}//nvlouvain + +/* + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void agreeOnProposal(const IndexType *row_offsets, const IndexType *column_indices, + IndexType num_block_rows, IndexType *aggregated, int *strongest_neighbour, float *weight_strongest_neighbour, IndexType *partner_index, int *aggregates) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int partner; + + while(tid < num_block_rows) + { + if (aggregated[tid] == -1) + { + partner = partner_index[tid]; + float my_weight = weight_strongest_neighbour[tid]; + float partners_weight = -1; + if (partner != -1) partners_weight = weight_strongest_neighbour[partner]; + + if (my_weight < 0. && partners_weight < 0.) { // All neighbours are aggregated, leave in current aggregate + //if (deterministic!=1) + //{ + aggregated[tid] = 1; + strongest_neighbour[tid] = -1; + partner_index[tid+num_block_rows] = tid; + partner_index[tid+2*num_block_rows] = tid; + //} + } + // if my weight is smaller than my partner's weight, change my strongest neighbour + else if (my_weight < partners_weight) + strongest_neighbour[tid] = strongest_neighbour[partner]; + + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that checks if perfect matchs exist +template +__global__ +void matchAggregates(IndexType *aggregates, IndexType *aggregated, IndexType *strongest_neighbour, const IndexType num_rows) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int potential_match, potential_match_neighbour, my_aggregate; + + while (tid < num_rows) + { + if (aggregated[tid] == -1) // Unaggregated row + { + + potential_match = strongest_neighbour[tid]; + if (potential_match!=-1) + { + potential_match_neighbour = strongest_neighbour[potential_match]; + + my_aggregate = aggregates[tid]; + + if (potential_match_neighbour == my_aggregate) // we have a match + { + aggregated[tid] = 1; + aggregates[tid] = ( potential_match > my_aggregate) ? 
my_aggregate: potential_match; + } + } + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that checks if perfect matchs exist +template +__global__ +void assignUnassignedVertices(IndexType *partner_index, const IndexType num_rows) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + while (tid < num_rows) + { + if (partner_index[tid] == -1) // Unaggregated row + { + partner_index[tid] = tid; + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that merges unaggregated vertices its strongest aggregated neighbour +// Edge weights are computed on the fly +// For block_dia_csr_matrix_format +template +__global__ +void mergeWithExistingAggregatesBlockDiaCsr(const IndexType *row_offsets, const IndexType *column_indices, const ValueType *dia_values, const ValueType *nonzero_values, + const int n, IndexType *aggregates, int bsize, int deterministic, IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int jcol; + ValueType weight; + int bsize_sq = bsize*bsize; + + while (tid < n) + { + int strongest_aggregated = -1; + ValueType max_weight_aggregated = 0.; + if (aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; + // Compute edge weight + weight = fabs(nonzero_values[j*bsize_sq])/max( fabs(dia_values[tid*bsize_sq]),fabs(dia_values[jcol*bsize_sq])); + + // Identify strongest aggregated neighbour + if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // aggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + + if (strongest_aggregated != -1) // Found a neighbour to aggregate to + { + if (deterministic) { + aggregates_candidate[tid] = aggregates[strongest_aggregated]; + } + else { + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + } + } + else // All neighbours are unaggregated, leave alone + { + if (deterministic) + aggregates_candidate[tid] = tid; + else + aggregates[tid] = tid; + } + } + tid += gridDim.x*blockDim.x; + } +} + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void findStrongestNeighbourBlockDiaCsr_NoMerge(const IndexType *row_offsets, const IndexType *column_indices, + float *edge_weights, const IndexType num_block_rows, IndexType* partner_index, int *strongest_neighbour, int deterministic) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int jmin,jmax; + float weight; + + int jcol; + + while (tid < num_block_rows) + { + float max_weight_unaggregated = 0.; + int strongest_unaggregated = -1; + + if (partner_index[tid] == -1) // Unaggregated row + { + jmin = row_offsets[tid]; + jmax = row_offsets[tid+1]; + + for (int j=jmin; j= num_block_rows) continue; // Skip diagonal and boundary edges. 
+ weight = edge_weights[j]; + // Identify strongest unaggregated neighbours + if (partner_index[jcol] == -1 && (weight > max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + } + } + + if (strongest_unaggregated == -1) // All neighbours are aggregated + { + // Put in its own aggregate + if (!deterministic) + partner_index[tid] = tid; + } + else + { + strongest_neighbour[tid] = strongest_unaggregated; + } + + //if (strongest_unaggregated != -1) // All neighbours are aggregated + // strongest_neighbour[tid] = strongest_unaggregated; + // Put in its own aggregate + // partner_index[tid] = tid; + //else + + + } + + tid += gridDim.x*blockDim.x; + } +} + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void findStrongestNeighbourBlockDiaCsr_StoreWeight(const IndexType *row_offsets, const IndexType *column_indices, + const float *edge_weights, const IndexType num_block_rows, IndexType *aggregated, IndexType *aggregates, int *strongest_neighbour, IndexType *partner_index, float *weight_strongest_neighbour, int deterministic) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + float weight; + + int jcol,jmin,jmax; + int agg_jcol; + + while (tid < num_block_rows) + { + float max_weight_unaggregated = 0.; + float max_weight_aggregated = 0.; + int strongest_unaggregated = -1; + int strongest_aggregated = -1; + int partner = -1; + if (aggregated[tid] == -1) // Unaggregated row + { + partner = partner_index[tid]; + jmin = row_offsets[tid]; + jmax = row_offsets[tid+1]; + + for (int j=jmin; j= num_block_rows) continue; // Skip diagonal and boundary edges. 
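// Several of these kernels take a "deterministic" flag together with an
// aggregates_candidate array: instead of updating aggregates[] in place (whose final
// content could then depend on the order in which thread blocks run), each thread only
// stages its decision, and a separate commit pass (such as the joinExistingAggregates
// kernels in these selector headers) applies the staged values afterwards. A minimal
// sketch of that commit step, using the hypothetical name commitStagedAggregates rather
// than the kernels defined here:

template <typename IndexType>
__global__ void commitStagedAggregates(IndexType n,
                                       IndexType*       aggregates,
                                       const IndexType* candidates)
{
    for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)
        if (aggregates[i] == -1 && candidates[i] != -1)
            aggregates[i] = candidates[i];   // apply the decision staged by the first pass
}

// In deterministic mode the selector first fills the candidate array (for example via
// the mergeWithExistingAggregates* kernels) and only then runs the commit pass, so the
// result no longer depends on scheduling order.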
+ weight = edge_weights[j]; + + agg_jcol = aggregated[jcol]; + + if (agg_jcol == -1 && jcol != partner && (weight > max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + } + else if (agg_jcol != -1 && jcol != partner && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // unaggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + + if (strongest_unaggregated== -1) // All neighbours are aggregated + { + if (!deterministic) + { + if (strongest_aggregated != -1) { + aggregates[tid] = aggregates[strongest_aggregated]; + aggregated[tid] = 1; + if (partner != -1) { + aggregates[partner] = aggregates[strongest_aggregated]; + aggregated[partner] = 1; + } + } + else {// leave in its own aggregate + if (partner != -1) + aggregated[partner] = 1; + aggregated[tid] = 1; + } + } + + } + else // Found an unaggregated aggregate + { + weight_strongest_neighbour[tid] = max_weight_unaggregated; + strongest_neighbour[tid] = aggregates[strongest_unaggregated]; + } + } + tid += gridDim.x*blockDim.x; + } +} + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// computes weight on the fly +template +__global__ +void findStrongestNeighbourBlockDiaCsr(const IndexType *row_offsets, const IndexType *column_indices, + const ValueType *dia_values, const ValueType *nonzero_values, const IndexType n, IndexType *aggregates, int *strongest_neighbour, int bsize) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + ValueType weight; + + int jcol; + int bsize_sq = bsize*bsize; + + while (tid < n) + { + ValueType max_weight_unaggregated = 0.; + ValueType max_weight_aggregated = 0.; + int strongest_unaggregated = -1; + int strongest_aggregated = -1; + if (aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; + + // Compute edge weight + for (int k=row_offsets[jcol];k max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + } + else if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // aggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + if (strongest_unaggregated == -1 && strongest_aggregated != -1) // All neighbours are aggregated + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + else if (strongest_unaggregated != -1) + strongest_neighbour[tid] = strongest_unaggregated; + else + strongest_neighbour[tid] = tid; + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that merges unaggregated vertices its strongest aggregated neighbour +// Weights are read from edge_weights array +// For block_dia_csr_matrix_format +template +__global__ +void mergeWithExistingAggregatesBlockDiaCsr(const IndexType *row_offsets, const IndexType *column_indices, const float *edge_weights, + const int num_block_rows, IndexType *aggregates, IndexType *aggregated, int deterministic, IndexType *aggregates_candidate, bool allow_singletons = true) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int jcol; + + float weight; + + while (tid < num_block_rows) + { + float max_weight_aggregated = 0.; + int strongest_aggregated = -1; + if (aggregated[tid] == -1) // Unaggregated row + { + for 
(int j=row_offsets[tid]; j= num_block_rows) continue; // Skip diagonal and boundary edges. + // Identify strongest aggregated neighbour + if (aggregated[jcol] != -1) { + + weight = edge_weights[j]; + if (weight > max_weight_aggregated || (weight == max_weight_aggregated && jcol > strongest_aggregated)) { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + + } + } + + if (strongest_aggregated != -1) { + if (deterministic) + { + aggregates_candidate[tid] = aggregates[strongest_aggregated]; + } + else + { + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + aggregated[tid] = 1; + } + } + else // All neighbours are unaggregated, leave alone + { + if (deterministic) { + if (allow_singletons) aggregates_candidate[tid] = tid; + } + else + aggregates[tid] = tid; + } + + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel to extract diagonal for csr_matrix format +template +__global__ +void getDiagonalKernel(const IndexType *offsets, const IndexType *column_indices, + const ValueType *values, const IndexType numRows, ValueType *diagonal) +{ + int tIdx = threadIdx.x + blockDim.x*blockIdx.x; + + while (tIdx < numRows) + { + const int offset = offsets[tIdx]; + const int numj = offsets[tIdx+1]-offset; + + for (int j=offset; j < offset+numj; j++) + { + int jcol = column_indices[j]; + if (tIdx == jcol) + { + diagonal[tIdx] = values[j]; + } + } + tIdx += gridDim.x*blockDim.x; + } +} + +template +__global__ void computeDiagonalKernelCOO(INDEX_TYPE num_nz, INDEX_TYPE *row_indices, INDEX_TYPE *col_indices, INDEX_TYPE *diag) { + //BLOCKY*BLOCKX threads per nz + INDEX_TYPE nz=(blockIdx.x*blockDim.x+threadIdx.x); + + while(nz +__global__ +void getDiagonalKernelNoDiaProp(const IndexType *dia_idx, const ValueType *values, const IndexType numRows, ValueType *diagonal) +{ + int tIdx = threadIdx.x + blockDim.x*blockIdx.x; + + while (tIdx < numRows) + { + diagonal[tIdx] = values[dia_idx[tIdx]]; + tIdx += gridDim.x*blockDim.x; + } +} + + + +*/ diff --git a/cpp/nvgraph/cpp/include/common_selector.hxx b/cpp/nvgraph/cpp/include/common_selector.hxx new file mode 100644 index 00000000000..c0a1baac64e --- /dev/null +++ b/cpp/nvgraph/cpp/include/common_selector.hxx @@ -0,0 +1,995 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +template __inline__ __device__ T_ELEM __cachingLoad(const T_ELEM *addr) { +#if __CUDA_ARCH__ < 350 + return *addr; +#else + return __ldg(addr); +#endif +} +__device__ +float random_weight(int i, int j, int n) +{ +#define RAND_MULTIPLIER 1145637293 + int i_min = (min(i, j) * RAND_MULTIPLIER) % n; + int i_max = (max(i, j) * RAND_MULTIPLIER) % n; + return ((float)i_max / n) * i_min; +} + +/* WARNING: notice that based on the hexadecimal number in the last line + in the hash function the resulting floating point value is very likely + on the order of 0.5. 
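   In other words, hash_val(a, seed)/UINT_MAX is roughly uniform in [0, 1], so on
   average the added fraction is about half of scaling_factor(). In
   computeEdgeWeightsBlockDiaCsr_V2 and computeEdgeWeights_simple below, the hash is
   used only to add a tiny, deterministic, symmetric perturbation to each edge weight
   (relative magnitude around 1e-5 for float and 1e-12 for double), so that distinct
   edges rarely tie exactly while the weights of (i,j) and (j,i) stay equal. An
   illustrative helper, assuming the hash_val and scaling_factor definitions that
   follow (the name perturb_weight itself is hypothetical):

     template <typename WeightType>
     __device__ WeightType perturb_weight(WeightType w, int i, int j)
     {
         WeightType frac = scaling_factor<WeightType>()
                           * hash_val(min(i, j), max(i, j)) / UINT_MAX;
         return w + frac * w;    // same perturbation for (i,j) and (j,i)
     }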
*/ +__host__ __device__ unsigned int hash_val(unsigned int a, unsigned int seed) +{ + a ^= seed; + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) + (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a ^ 0xd3a2646c) + (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) + (a >> 16); + return a; +} + +/* return 1e-5 for float [sizeof(float)=4] and 1e-12 for double [sizeof(double)=8] types */ +template +__host__ __device__ WeightType scaling_factor(){ + return (sizeof(WeightType) == 4) ? 1e-5f : 1e-12; +} + +// Kernel to compute the weight of the edges +// original version from AmgX. +template +__global__ +void computeEdgeWeightsBlockDiaCsr_V2( const IndexType* row_offsets, const IndexType *row_indices, const IndexType *column_indices, + const IndexType *dia_values, const ValueType* nonzero_values, const IndexType num_nonzero_blocks, + WeightType *str_edge_weights, WeightType *rand_edge_weights, int num_owned, int bsize, int component, int weight_formula) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int i,j,kmin,kmax; + int bsize_sq = bsize*bsize; + WeightType den; + + int matrix_weight_entry = component*bsize+component; + + while (tid < num_nonzero_blocks) + { + i = row_indices[tid]; + j = column_indices[tid]; + + if ((i != j) && (j < num_owned)) // skip diagonal and across-boundary edges + { + den = (WeightType) max(fabs(__cachingLoad(&nonzero_values[dia_values[i]*bsize_sq+matrix_weight_entry])),fabs(__cachingLoad(&nonzero_values[dia_values[j]*bsize_sq+matrix_weight_entry]))); + + kmin = __cachingLoad(&row_offsets[j]); //kmin = row_offsets[j]; + kmax = __cachingLoad(&row_offsets[j+1]); //kmax = row_offsets[j+1]; + + WeightType kvalue = 0.0; + bool foundk = false; + for (int k=kmin;k()*hash_val(min(i,j),max(i,j))/UINT_MAX; + ed_weight += small_fraction*ed_weight; + str_edge_weights[tid] = ed_weight; + + // fill up random unique weights + if( rand_edge_weights != NULL ) + rand_edge_weights[tid] = random_weight(i, j, num_owned); + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel to compute the weight of the edges +// simple version modified for nvgraph +template +__global__ +void computeEdgeWeights_simple( const IndexType* row_offsets, const IndexType *row_indices, const IndexType *column_indices, + const ValueType *row_sum, const ValueType* nonzero_values, const IndexType num_nonzero_blocks, + WeightType *str_edge_weights, WeightType *rand_edge_weights, int n, int weight_formula) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int i,j,kmin,kmax; + WeightType den; + + while (tid < num_nonzero_blocks) + { + i = row_indices[tid]; + j = column_indices[tid]; + + if ((i != j) && (j < n)) // skip diagonal and across-boundary edges + { + den = (WeightType) max(fabs(__cachingLoad(&row_sum[i])),fabs(__cachingLoad(&row_sum[j]))); + + kmin = __cachingLoad(&row_offsets[j]); //kmin = row_offsets[j]; + kmax = __cachingLoad(&row_offsets[j+1]); //kmax = row_offsets[j+1]; + + WeightType kvalue = 0.0; + bool foundk = false; + for (int k=kmin;k()*hash_val(min(i,j),max(i,j))/UINT_MAX; + ed_weight += small_fraction*ed_weight; + str_edge_weights[tid] = ed_weight; + + // fill up random unique weights + if( rand_edge_weights != NULL ) + rand_edge_weights[tid] = random_weight(i, j, n); + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel to compute the weight of the edges using geometry distance between edges +template +__global__ +void computeEdgeWeightsDistance3d( const int* row_offsets, const IndexType *column_indices, + const ValueType* gx, const ValueType* gy, 
const ValueType* gz, float *str_edge_weights, int num_rows) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + float lx, ly, lz; + float px, py, pz; + int kmin, kmax; + int col_id; + + while (tid < num_rows) + { + lx = gx[tid]; + ly = gy[tid]; + lz = gz[tid]; + kmin = row_offsets[tid]; + kmax = row_offsets[tid+1]; + + for (int k=kmin;k +__global__ +void matchEdges(const IndexType num_rows, IndexType *partner_index, IndexType *aggregates, const IndexType *strongest_neighbour) +{ + int potential_match, potential_match_neighbour; + + for (int tid= threadIdx.x + blockDim.x*blockIdx.x; tid < num_rows; tid += gridDim.x*blockDim.x) + { + if (partner_index[tid] == -1) // Unaggregated row + { + potential_match = strongest_neighbour[tid]; + if (potential_match!=-1) + { + potential_match_neighbour = strongest_neighbour[potential_match]; + + if ( potential_match_neighbour == tid ) // we have a match + { + partner_index[tid] = potential_match; + aggregates[tid] = ( potential_match > tid) ? tid : potential_match; + } + } + } + } +} + +template +__global__ +void joinExistingAggregates(IndexType num_rows, IndexType *aggregates, IndexType *aggregated, const IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + while (tid < num_rows) + { + if (aggregated[tid] == -1 && aggregates_candidate[tid] != -1) // Unaggregated row + { + aggregates[tid] = aggregates_candidate[tid]; + aggregated[tid] = 1; + } + + tid += gridDim.x*blockDim.x; + } +} + + +template +__global__ +void aggregateSingletons( IndexType* aggregates, IndexType numRows ) +{ + int tid = threadIdx.x + blockDim.x*blockIdx.x; + + while( tid < numRows ) + { + if( aggregates[tid] == -1 ) //still unaggregated! + aggregates[tid] = tid; //then become a singleton + + tid += gridDim.x*blockDim.x; + } +} + +__device__ +float random_weight2(int i, int j) +{ +#define RAND_MULTIPLIER 1145637293 + unsigned long i_min = (min(i, j) * RAND_MULTIPLIER); + unsigned long i_max = (max(i, j) * RAND_MULTIPLIER); + return ((float)i_min / i_max); +} + + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void findStrongestNeighbourBlockDiaCsr_V2(const IndexType *row_offsets, const IndexType *column_indices, + const float *edge_weights, IndexType n, IndexType *aggregates, + IndexType *strongest_neighbour_1phase, IndexType *strongest_neighbour, + const size_t bsize, int phase, bool merge_singletons) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + float weight; + int jcol; + + while (tid < n) + { + int strongest_unaggregated = -1; + int strongest_aggregated = -1; + float max_weight_unaggregated = 0.; + float max_weight_aggregated = 0.; + if (aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; // skip diagonal and halo + if (phase == 2 && strongest_neighbour_1phase[jcol] != tid) continue; // if 2nd phase only accept those who gave a hand on the 1st phase + + // Identify strongest aggregated and unaggregated neighbours + if (aggregates[jcol] == -1 && (weight > max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + } + else if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // aggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + if (strongest_unaggregated == -1 && 
strongest_aggregated != -1) // All neighbours are aggregated + { + if( merge_singletons ) + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + else + aggregates[tid] = tid; + } + else if (strongest_unaggregated != -1) { + if (phase == 2) { + float rand_w1 = random_weight2(tid, strongest_neighbour_1phase[tid]); + strongest_neighbour[tid] = max_weight_unaggregated > rand_w1 ? strongest_unaggregated : strongest_neighbour_1phase[tid]; + } + else strongest_neighbour_1phase[tid] = strongest_unaggregated; + } + else { + if (phase == 2) strongest_neighbour[tid] = strongest_neighbour_1phase[tid]; + else strongest_neighbour_1phase[tid] = tid; + } + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that checks if perfect matchs exist +template +__global__ +void matchEdges(const IndexType num_rows, IndexType *aggregates, const int *strongest_neighbour) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int potential_match, potential_match_neighbour; + + while (tid < num_rows) + { + if (aggregates[tid] == -1) // Unaggregated row + { + potential_match = strongest_neighbour[tid]; + potential_match_neighbour = strongest_neighbour[potential_match]; + + if (potential_match != -1 && potential_match_neighbour == tid) // we have a match + aggregates[tid] = ( potential_match > tid ) ? tid : potential_match; + /* + if (potential_match != -1){ + potential_match_neighbour = strongest_neighbour[potential_match]; + + if (potential_match_neighbour == tid) // we have a match + aggregates[tid] = ( potential_match > tid ) ? tid : potential_match; + } + */ + } + tid += gridDim.x*blockDim.x; + } +} + +template +__global__ +void countAggregates(const IndexType num_rows, const IndexType *aggregates, int *num_unaggregated) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + int c = 0; + int i = tid; + while( i < num_rows ) { + c += ( aggregates[i] == -1 ); + i += gridDim.x * blockDim.x; + } + __shared__ volatile int smem[block_size]; + smem[threadIdx.x] = c; + __syncthreads(); + + for( int off = blockDim.x / 2; off >= 32; off = off / 2 ) { + if( threadIdx.x < off ) + smem[threadIdx.x] += smem[threadIdx.x + off]; + __syncthreads(); + } + + // warp reduce + if( threadIdx.x < 32 ) { + smem[threadIdx.x] += smem[threadIdx.x+16]; + smem[threadIdx.x] += smem[threadIdx.x+8]; + smem[threadIdx.x] += smem[threadIdx.x+4]; + smem[threadIdx.x] += smem[threadIdx.x+2]; + smem[threadIdx.x] += smem[threadIdx.x+1]; + } + + if( threadIdx.x == 0 ) + atomicAdd(num_unaggregated, smem[0]); +} + + +template +__global__ +void joinExistingAggregates(IndexType num_rows, IndexType *aggregates, const IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + while (tid < num_rows) + { + if (aggregates[tid] == -1 && aggregates_candidate[tid] != -1) // Unaggregated row + aggregates[tid] = aggregates_candidate[tid]; + + tid+=gridDim.x*blockDim.x; + } +} + + + +// Kernel that merges unaggregated vertices its strongest aggregated neighbour +// Weights are read from edge_weights array +// For block_dia_csr_matrix_format +template +__global__ +void mergeWithExistingAggregatesBlockDiaCsr_V2(const IndexType *row_offsets, const IndexType *column_indices, const float *edge_weights, + const int n, IndexType *aggregates, int bsize, const int deterministic, IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int jcol; + float weight; + + while (tid < n) + { + float max_weight_aggregated = 0.; + int strongest_aggregated = -1; + if 
(aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; // skip diagonal + + // Identify strongest aggregated neighbour + if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + + if (strongest_aggregated != -1) // Found a neighbour to aggregate to + { + if (deterministic) { + aggregates_candidate[tid] = aggregates[strongest_aggregated]; + } + else { + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + } + } + else // All neighbours are unaggregated, leave alone + { + if (deterministic) + aggregates_candidate[tid] = tid; + else + aggregates[tid] = tid; + } + + + } + tid += gridDim.x*blockDim.x; + } +} + + + +template +__global__ void computeDiagonalKernelCSR(INDEX_TYPE num_rows, const INDEX_TYPE *row_offsets, const INDEX_TYPE *col_indices, INDEX_TYPE *diag) { + + INDEX_TYPE row=(blockIdx.x*blockDim.x+threadIdx.x); + + while(row +__global__ void convert_type(int n, const T1 *src, T2 *dest) { + + int tid=(blockIdx.x*blockDim.x+threadIdx.x); + while(tid(src[tid]); + tid += gridDim.x*blockDim.x; + } +} + +/* + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void agreeOnProposal(const IndexType *row_offsets, const IndexType *column_indices, + IndexType num_block_rows, IndexType *aggregated, int *strongest_neighbour, float *weight_strongest_neighbour, IndexType *partner_index, int *aggregates) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int partner; + + while(tid < num_block_rows) + { + if (aggregated[tid] == -1) + { + partner = partner_index[tid]; + float my_weight = weight_strongest_neighbour[tid]; + float partners_weight = -1; + if (partner != -1) partners_weight = weight_strongest_neighbour[partner]; + + if (my_weight < 0. && partners_weight < 0.) { // All neighbours are aggregated, leave in current aggregate + //if (deterministic!=1) + //{ + aggregated[tid] = 1; + strongest_neighbour[tid] = -1; + partner_index[tid+num_block_rows] = tid; + partner_index[tid+2*num_block_rows] = tid; + //} + } + // if my weight is smaller than my partner's weight, change my strongest neighbour + else if (my_weight < partners_weight) + strongest_neighbour[tid] = strongest_neighbour[partner]; + + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that checks if perfect matchs exist +template +__global__ +void matchAggregates(IndexType *aggregates, IndexType *aggregated, IndexType *strongest_neighbour, const IndexType num_rows) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int potential_match, potential_match_neighbour, my_aggregate; + + while (tid < num_rows) + { + if (aggregated[tid] == -1) // Unaggregated row + { + + potential_match = strongest_neighbour[tid]; + if (potential_match!=-1) + { + potential_match_neighbour = strongest_neighbour[potential_match]; + + my_aggregate = aggregates[tid]; + + if (potential_match_neighbour == my_aggregate) // we have a match + { + aggregated[tid] = 1; + aggregates[tid] = ( potential_match > my_aggregate) ? 
my_aggregate: potential_match; + } + } + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that checks if perfect matchs exist +template +__global__ +void assignUnassignedVertices(IndexType *partner_index, const IndexType num_rows) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + while (tid < num_rows) + { + if (partner_index[tid] == -1) // Unaggregated row + { + partner_index[tid] = tid; + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that merges unaggregated vertices its strongest aggregated neighbour +// Edge weights are computed on the fly +// For block_dia_csr_matrix_format +template +__global__ +void mergeWithExistingAggregatesBlockDiaCsr(const IndexType *row_offsets, const IndexType *column_indices, const ValueType *dia_values, const ValueType *nonzero_values, + const int n, IndexType *aggregates, int bsize, int deterministic, IndexType *aggregates_candidate) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int jcol; + ValueType weight; + int bsize_sq = bsize*bsize; + + while (tid < n) + { + int strongest_aggregated = -1; + ValueType max_weight_aggregated = 0.; + if (aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; + // Compute edge weight + weight = fabs(nonzero_values[j*bsize_sq])/max( fabs(dia_values[tid*bsize_sq]),fabs(dia_values[jcol*bsize_sq])); + + // Identify strongest aggregated neighbour + if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // aggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + + if (strongest_aggregated != -1) // Found a neighbour to aggregate to + { + if (deterministic) { + aggregates_candidate[tid] = aggregates[strongest_aggregated]; + } + else { + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + } + } + else // All neighbours are unaggregated, leave alone + { + if (deterministic) + aggregates_candidate[tid] = tid; + else + aggregates[tid] = tid; + } + } + tid += gridDim.x*blockDim.x; + } +} + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void findStrongestNeighbourBlockDiaCsr_NoMerge(const IndexType *row_offsets, const IndexType *column_indices, + float *edge_weights, const IndexType num_block_rows, IndexType* partner_index, int *strongest_neighbour, int deterministic) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + int jmin,jmax; + float weight; + + int jcol; + + while (tid < num_block_rows) + { + float max_weight_unaggregated = 0.; + int strongest_unaggregated = -1; + + if (partner_index[tid] == -1) // Unaggregated row + { + jmin = row_offsets[tid]; + jmax = row_offsets[tid+1]; + + for (int j=jmin; j= num_block_rows) continue; // Skip diagonal and boundary edges. 
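// Every kernel in this header walks its rows or edges with the same grid-stride idiom
// (tid starts at blockIdx.x*blockDim.x + threadIdx.x and advances by
// gridDim.x*blockDim.x), so correctness never depends on launching exactly one thread
// per element. A minimal, self-contained illustration of the pattern; the kernel and
// launch below are hypothetical, not part of this header:

__global__ void scale_rows(int n, float* x, float a)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)    // stride by the whole grid
        x[i] *= a;
}

// A fixed launch such as
//     scale_rows<<<160, 256, 0, stream>>>(n, d_x, 2.0f);
// covers any n; changing the block count only changes how many iterations each thread
// performs, not the result.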
+ weight = edge_weights[j]; + // Identify strongest unaggregated neighbours + if (partner_index[jcol] == -1 && (weight > max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + } + } + + if (strongest_unaggregated == -1) // All neighbours are aggregated + { + // Put in its own aggregate + if (!deterministic) + partner_index[tid] = tid; + } + else + { + strongest_neighbour[tid] = strongest_unaggregated; + } + + //if (strongest_unaggregated != -1) // All neighbours are aggregated + // strongest_neighbour[tid] = strongest_unaggregated; + // Put in its own aggregate + // partner_index[tid] = tid; + //else + + + } + + tid += gridDim.x*blockDim.x; + } +} + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// Reads the weight from edge_weights array +template +__global__ +void findStrongestNeighbourBlockDiaCsr_StoreWeight(const IndexType *row_offsets, const IndexType *column_indices, + const float *edge_weights, const IndexType num_block_rows, IndexType *aggregated, IndexType *aggregates, int *strongest_neighbour, IndexType *partner_index, float *weight_strongest_neighbour, int deterministic) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + float weight; + + int jcol,jmin,jmax; + int agg_jcol; + + while (tid < num_block_rows) + { + float max_weight_unaggregated = 0.; + float max_weight_aggregated = 0.; + int strongest_unaggregated = -1; + int strongest_aggregated = -1; + int partner = -1; + if (aggregated[tid] == -1) // Unaggregated row + { + partner = partner_index[tid]; + jmin = row_offsets[tid]; + jmax = row_offsets[tid+1]; + + for (int j=jmin; j= num_block_rows) continue; // Skip diagonal and boundary edges. 
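// countAggregates above reduces per-thread counts through shared memory (a tree
// reduction followed by a volatile warp stage) and finishes with one atomicAdd per
// block. For comparison, on sm_30+ the same reduction is often written with warp
// shuffles; the kernel below is an illustrative alternative only (it assumes blockDim.x
// is a multiple of 32), not the code used by this selector:

__global__ void count_unaggregated(int n, const int* aggregates, int* num_unaggregated)
{
    int c = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)
        c += (aggregates[i] == -1);             // per-thread partial count

    for (int off = 16; off > 0; off >>= 1)      // warp-wide shuffle reduction
        c += __shfl_down_sync(0xffffffffu, c, off);

    if ((threadIdx.x & 31) == 0)                // lane 0 of each warp holds the warp total
        atomicAdd(num_unaggregated, c);
}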
+ weight = edge_weights[j]; + + agg_jcol = aggregated[jcol]; + + if (agg_jcol == -1 && jcol != partner && (weight > max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + } + else if (agg_jcol != -1 && jcol != partner && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // unaggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + + if (strongest_unaggregated== -1) // All neighbours are aggregated + { + if (!deterministic) + { + if (strongest_aggregated != -1) { + aggregates[tid] = aggregates[strongest_aggregated]; + aggregated[tid] = 1; + if (partner != -1) { + aggregates[partner] = aggregates[strongest_aggregated]; + aggregated[partner] = 1; + } + } + else {// leave in its own aggregate + if (partner != -1) + aggregated[partner] = 1; + aggregated[tid] = 1; + } + } + + } + else // Found an unaggregated aggregate + { + weight_strongest_neighbour[tid] = max_weight_unaggregated; + strongest_neighbour[tid] = aggregates[strongest_unaggregated]; + } + } + tid += gridDim.x*blockDim.x; + } +} + +// findStrongestNeighbour kernel for block_dia_csr_matrix format +// computes weight on the fly +template +__global__ +void findStrongestNeighbourBlockDiaCsr(const IndexType *row_offsets, const IndexType *column_indices, + const ValueType *dia_values, const ValueType *nonzero_values, const IndexType n, IndexType *aggregates, int *strongest_neighbour, int bsize) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + ValueType weight; + + int jcol; + int bsize_sq = bsize*bsize; + + while (tid < n) + { + ValueType max_weight_unaggregated = 0.; + ValueType max_weight_aggregated = 0.; + int strongest_unaggregated = -1; + int strongest_aggregated = -1; + if (aggregates[tid] == -1) // Unaggregated row + { + for (int j=row_offsets[tid]; j= n) continue; + + // Compute edge weight + for (int k=row_offsets[jcol];k max_weight_unaggregated || (weight==max_weight_unaggregated && jcol > strongest_unaggregated))) // unaggregated + { + max_weight_unaggregated= weight; + strongest_unaggregated= jcol; + } + else if (aggregates[jcol] != -1 && (weight > max_weight_aggregated || (weight==max_weight_aggregated && jcol > strongest_aggregated))) // aggregated + { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + } + if (strongest_unaggregated == -1 && strongest_aggregated != -1) // All neighbours are aggregated + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + else if (strongest_unaggregated != -1) + strongest_neighbour[tid] = strongest_unaggregated; + else + strongest_neighbour[tid] = tid; + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel that merges unaggregated vertices its strongest aggregated neighbour +// Weights are read from edge_weights array +// For block_dia_csr_matrix_format +template +__global__ +void mergeWithExistingAggregatesBlockDiaCsr(const IndexType *row_offsets, const IndexType *column_indices, const float *edge_weights, + const int num_block_rows, IndexType *aggregates, IndexType *aggregated, int deterministic, IndexType *aggregates_candidate, bool allow_singletons = true) +{ + int tid= threadIdx.x + blockDim.x*blockIdx.x; + + int jcol; + + float weight; + + while (tid < num_block_rows) + { + float max_weight_aggregated = 0.; + int strongest_aggregated = -1; + if (aggregated[tid] == -1) // Unaggregated row + { + for 
(int j=row_offsets[tid]; j= num_block_rows) continue; // Skip diagonal and boundary edges. + // Identify strongest aggregated neighbour + if (aggregated[jcol] != -1) { + + weight = edge_weights[j]; + if (weight > max_weight_aggregated || (weight == max_weight_aggregated && jcol > strongest_aggregated)) { + max_weight_aggregated = weight; + strongest_aggregated = jcol; + } + + } + } + + if (strongest_aggregated != -1) { + if (deterministic) + { + aggregates_candidate[tid] = aggregates[strongest_aggregated]; + } + else + { + // Put in same aggregate as strongest neighbour + aggregates[tid] = aggregates[strongest_aggregated]; + aggregated[tid] = 1; + } + } + else // All neighbours are unaggregated, leave alone + { + if (deterministic) { + if (allow_singletons) aggregates_candidate[tid] = tid; + } + else + aggregates[tid] = tid; + } + + } + tid += gridDim.x*blockDim.x; + } +} + +// Kernel to extract diagonal for csr_matrix format +template +__global__ +void getDiagonalKernel(const IndexType *offsets, const IndexType *column_indices, + const ValueType *values, const IndexType numRows, ValueType *diagonal) +{ + int tIdx = threadIdx.x + blockDim.x*blockIdx.x; + + while (tIdx < numRows) + { + const int offset = offsets[tIdx]; + const int numj = offsets[tIdx+1]-offset; + + for (int j=offset; j < offset+numj; j++) + { + int jcol = column_indices[j]; + if (tIdx == jcol) + { + diagonal[tIdx] = values[j]; + } + } + tIdx += gridDim.x*blockDim.x; + } +} + +template +__global__ void computeDiagonalKernelCOO(INDEX_TYPE num_nz, INDEX_TYPE *row_indices, INDEX_TYPE *col_indices, INDEX_TYPE *diag) { + //BLOCKY*BLOCKX threads per nz + INDEX_TYPE nz=(blockIdx.x*blockDim.x+threadIdx.x); + + while(nz +__global__ +void getDiagonalKernelNoDiaProp(const IndexType *dia_idx, const ValueType *values, const IndexType numRows, ValueType *diagonal) +{ + int tIdx = threadIdx.x + blockDim.x*blockIdx.x; + + while (tIdx < numRows) + { + diagonal[tIdx] = values[dia_idx[tIdx]]; + tIdx += gridDim.x*blockDim.x; + } +} + + + +*/ diff --git a/cpp/nvgraph/cpp/include/csr_graph.hxx b/cpp/nvgraph/cpp/include/csr_graph.hxx new file mode 100644 index 00000000000..db77baed371 --- /dev/null +++ b/cpp/nvgraph/cpp/include/csr_graph.hxx @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "graph.hxx" +#include // interface with CuMem (memory pool lib) for shared ptr + +namespace nvgraph +{ + +/*! A CsrGraph is a graph strored in a CSR data structure. + It represents an unweighted graph and has storage for row_offsets and column_indices + */ +template +class CsrGraph : public nvgraph::Graph +{ +public: + typedef IndexType_ IndexType; + +private: + typedef nvgraph::Graph Parent; + +protected: + /*! Storage for the cuda stream + */ + cudaStream_t stream_; + + /*! Storage for the row offsets of the CSR data structure. Also called the "row pointer" array. + */ + SHARED_PREFIX::shared_ptr row_offsets; + + /*! 
Storage for the column indices of the CSR data structure. + */ + SHARED_PREFIX::shared_ptr column_indices; + +public: + + /*! Construct an empty \p CsrGraph. + */ + CsrGraph(void) {} + + /*! Destruct an empty \p CsrGraph. + */ + ~CsrGraph(void) {} + + /*! Construct a \p CsrGraph with a specific shape and number of nonzero entries. + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero graph entries. + */ + CsrGraph(size_t num_rows, size_t num_entries, cudaStream_t stream, bool external = false) + : Parent(num_rows, num_entries), + stream_(stream) + { + if (external) + { + row_offsets = nullptr; + column_indices = nullptr; + } + else + { + row_offsets = allocateDevice((num_rows+1), NULL); + column_indices = allocateDevice(num_entries, NULL); + } + } + + + /*! Construct a \p CsrGraph from another graph. + * + * \param CsrGraph Another graph in csr + */ + CsrGraph(const CsrGraph& gr): + Parent(gr), + row_offsets(gr.row_offsets), + column_indices(gr.column_indices) + {} + + /*! Construct a \p CsrGraph from another graph. + * + * \param CsrGraph Another graph in csr + */ + CsrGraph(const Parent& gr): + Parent(gr) + // row_offsets(allocateDevice((gr.get_num_vertices()+1), NULL)), + // column_indices(allocateDevice(gr.get_num_edges(), NULL)) + {} + + inline void allocate_row_offsets() + { + row_offsets = allocateDevice(this->get_num_vertices()+1, NULL); + } + inline void allocate_column_indices() + { + column_indices = allocateDevice(this->get_num_edges(), NULL); + } + inline IndexType* get_raw_row_offsets() { return row_offsets.get(); } + inline IndexType* get_raw_column_indices() { return column_indices.get(); } + inline void set_raw_row_offsets(IndexType* ptr) { row_offsets = attachDevicePtr(ptr, stream_); } + inline void set_raw_column_indices(IndexType* ptr) {column_indices = attachDevicePtr(ptr, stream_); } + inline const IndexType* get_raw_row_offsets() const { return row_offsets.get(); } + inline const IndexType* get_raw_column_indices() const { return column_indices.get(); } + inline cudaStream_t get_stream() const { return stream_; } + + /*! Resize graph dimensions and underlying storage + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero graph entries. + */ + // We should try not to resize CSR graphs in general + // void resize(const size_t num_rows, const size_t num_entries); + + /*! Swap the contents of two \p CsrGraph objects. + * + * \param graph Another graph in csr + */ + void swap(CsrGraph& graph); + + /*! Assignment from another graph. + * + * \param graph Another graph in csr + */ + CsrGraph& operator=(const CsrGraph& graph); + + //Accept method injection + DEFINE_VISITABLE(IndexType_) + +}; // class CsrGraph +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/csrmv_cub.h b/cpp/nvgraph/cpp/include/csrmv_cub.h new file mode 100644 index 00000000000..41e8e096daf --- /dev/null +++ b/cpp/nvgraph/cpp/include/csrmv_cub.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "nvgraph.h" +#include "nvgraph_error.hxx" +#include "multi_valued_csr_graph.hxx" + +namespace nvgraph +{ + +template +class SemiringDispatch +{ +public: + template + static NVGRAPH_ERROR Dispatch( + const V* d_values, + const I* d_row_offsets, + const I* d_column_indices, + const V* d_vector_x, + V* d_vector_y, + V alpha, + V beta, + I num_rows, + I num_cols, + I num_nonzeros, + cudaStream_t stream); + + static NVGRAPH_ERROR InitAndLaunch( + const nvgraph::MultiValuedCsrGraph &graph, + const size_t weight_index, + const void *p_alpha, + const size_t x_index, + const void *p_beta, + const size_t y_index, + const nvgraphSemiring_t SR, + cudaStream_t stream + ); +}; + + +// API wrapper to avoid bloating main API object nvgraph.cpp +NVGRAPH_ERROR SemiringAPILauncher(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t sr); +} //namespace nvgraph diff --git a/cpp/nvgraph/cpp/include/debug_help.h b/cpp/nvgraph/cpp/include/debug_help.h new file mode 100644 index 00000000000..09e3c203258 --- /dev/null +++ b/cpp/nvgraph/cpp/include/debug_help.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* + * debug_help.h + * + * Created on: Jul 19, 2018 + * Author: jwyles + */ + +#include +#include + +#pragma once + +namespace debug { + template + void printDeviceVector(T* dev_ptr, int items, std::string title) { + T* host_ptr = (T*)malloc(sizeof(T) * items); + cudaMemcpy(host_ptr, dev_ptr, sizeof(T) * items, cudaMemcpyDefault); + std::cout << title << ": { "; + for (int i = 0; i < items; i++) { + std::cout << host_ptr[i] << ((i < items - 1) ? ", " : " "); + } + std::cout << "}\n"; + free(host_ptr); + } +} diff --git a/cpp/nvgraph/cpp/include/debug_macros.h b/cpp/nvgraph/cpp/include/debug_macros.h new file mode 100644 index 00000000000..7d2be79343d --- /dev/null +++ b/cpp/nvgraph/cpp/include/debug_macros.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once + +#include "nvgraph_error.hxx" + +#define CHECK_STATUS(...) \ + do { \ + if (__VA_ARGS__) { \ + FatalError(#__VA_ARGS__, NVGRAPH_ERR_UNKNOWN); \ + } \ + } while (0) + +#define CHECK_NVGRAPH(...) 
\ + do { \ + NVGRAPH_ERROR e = __VA_ARGS__; \ + if (e != NVGRAPH_OK) { \ + FatalError(#__VA_ARGS__, e) \ + } \ + } while (0) + +#ifdef DEBUG +#define COUT() (std::cout) +#define CERR() (std::cerr) +#define WARNING(message) \ + do { \ + std::stringstream ss; \ + ss << "Warning (" << __FILE__ << ":" << __LINE__ << "): " << message; \ + CERR() << ss.str() << std::endl; \ + } while (0) +#else // DEBUG +#define WARNING(message) +#endif diff --git a/cpp/nvgraph/cpp/include/delta_modularity.cuh b/cpp/nvgraph/cpp/include/delta_modularity.cuh new file mode 100644 index 00000000000..b396757b30b --- /dev/null +++ b/cpp/nvgraph/cpp/include/delta_modularity.cuh @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include "util.cuh" +#include "graph_utils.cuh" +#include "functor.cuh" +//#include "block_delta_modularity.cuh" + +#include + + +namespace nvlouvain{ + + +/************************************************************* +* +* compute k_i_in +* +* - input : +* n_vertex +* csr_ptr's ptr +* csr_idx's ptr +* csr_val's ptr +* cluster's ptr : current cluster assignment +* c: target cluster +* i: current vertex +* +* - output: +* results: k i in c +* +***************************************************************/ + +template +__device__ void compute_k_i_in( const int n_vertex, + IdxType* csr_ptr_ptr, + IdxType* csr_idx_ptr, + ValType* csr_val_ptr, + IdxType* cluster_ptr, + IdxType c, // tid.y + IdxType i, // tid.x + ValType* result){ + ValType sum = 0.0; + //Sanity check + if( i < n_vertex ){ + + IdxType i_start = *(csr_ptr_ptr + i); + IdxType i_end = *(csr_ptr_ptr + i + 1); + +#pragma unroll + for(int j = 0; j < i_end - i_start; ++j){ + IdxType j_idx = *(csr_idx_ptr + i_start + j); + IdxType c_j = *(cluster_ptr + j_idx); + sum += (int)(c_j==c)*((ValType)(*(csr_val_ptr + i_start + j))); + } + *result = sum; + } + +} + + +// delta modularity when an isolate vertex i moved into a cluster c +// c must be one of the clusters +// ptr version +template +__device__ void +delta_modularity(const int n_vertex, const int c_size, bool updated, + IdxType* csr_ptr_ptr, IdxType* csr_ind_ptr, ValType* csr_val_ptr, + IdxType* cluster_ptr, + ValType c_sum, ValType m2, + IdxType row_idx, IdxType col_idx, IdxType c, ValType* k_vec_ptr, ValType* score){ + + // ki: sum of i's edges weight + // ki_in: sum of edge from i to c + // sum_tot: for all v in c, sum of v's edges weight + + IdxType c_i = *(cluster_ptr + row_idx); + ValType ki_in = 0.0; + ki_in = (int)(c_i!=c)*(*(csr_val_ptr + col_idx)); + ValType ki = *(k_vec_ptr + row_idx); + + + if(!updated){ + compute_k_i_in(n_vertex, csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, cluster_ptr, c, row_idx, &ki_in); + } + + ValType sum_tot = c_sum - (int)(c_i == c)*ki; + *score = ki_in - 2*sum_tot*ki/(m2); +// printf("i: %d\tci: %d\tc: %d\t2m: %1f\tkin: %f\tki: %f\tsum_tot: %f\tc_sum: %f\tdelta: %f\n", row_idx, c_i, c, m2, ki_in, ki, 
sum_tot, c_sum,*score ); +} + + + +template +__device__ void compute_cluster_sum(const int n_vertex, const int c_size, + IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, + ValType* k_ptr, // pre-compute ki size: n_vertex + ValType* cluster_sum_vec){ + + int c = blockIdx.x * blockDim.x + threadIdx.x; + IdxType c_start, c_end; + ValType sum = 0.0; + if(c < c_size){ + c_start = *(cluster_inv_ptr_ptr + c); + c_end = *(cluster_inv_ptr_ptr + c + 1); + +#pragma unroll + for(IdxType* it = cluster_inv_ind_ptr + c_start; it!= cluster_inv_ind_ptr + c_end ; ++it){ + sum += (ValType)(*(k_ptr + *(it))); + } + *(cluster_sum_vec + c) = sum; + //printf("c: %d c_sum: %f\n", c, (ValType)(*(cluster_sum_vec + c))); + } + + +} + + +template +__global__ void +kernel_compute_cluster_sum(const int n_vertex, const int c_size, + IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, + ValType* k_ptr, // pre-compute ki size: n_vertex + ValType* cluster_sum_vec){ + + compute_cluster_sum(n_vertex, c_size, + cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + k_ptr, cluster_sum_vec); + +} + + +/**************************************************************************************************** +* +* compute delta modularity vector, delta_modularity_vec, size = n_edges +* theads layout: (lunched as 1D) +* 1 thread for 1 edge, flattened +* need coo row index instead (pre-computed) +* input variables: +* n_vertex: number of vertex +* n_edges: number of edges +* c_size: number of unique clusters +* updated: if previous iteration generate a new supervertices graph +* cluster_ptr: cluster assignment +* cluster_sum_vec_ptr: sum of clusters +* k_vec_ptr: ki vector +* output: +* delta_modularity_vec: size = n_edges +* delta modularity if we move from_node to to_nodes cluster c for each edge +* +****************************************************************************************************/ +template +__global__ void// __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +build_delta_modularity_vec_flat(const int n_vertex, const int n_edges, const int c_size, ValType m2, bool updated, + IdxType* coo_row_ind_ptr, IdxType* csr_ptr_ptr, IdxType* csr_ind_ptr, ValType* csr_val_ptr, + IdxType* cluster_ptr, + ValType* cluster_sum_vec_ptr, + ValType* k_vec_ptr, + ValType* delta_modularity_vec){ + + ValType m2_s(m2); //privatize + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < n_edges ){ + IdxType row_idx = *(coo_row_ind_ptr + tid); + IdxType col_idx = *(csr_ind_ptr + tid); + IdxType c = cluster_ptr[ col_idx ]; // target cluster c + ValType c_sum = cluster_sum_vec_ptr[c]; + + delta_modularity(n_vertex, c_size, updated, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, + c_sum, m2_s, + row_idx, col_idx, c, k_vec_ptr, delta_modularity_vec + tid); + + } +} + + +/****************************************************************************************************** +* NOT USED +* compute delta modularity vector, delta_modularity_vec, size = n_edges +* theads layout: (lauched as 2D) +* 1 thread for 1 edge +* each thread.x per vertex i +* each thread.y per neibor j of vertex i +* need to pre compute max_degree for lauch this kernel +* input variables: +* n_vertex: number of vertex +* n_edges: number of edges +* c_size: number of unique clusters +* updated: if previous iteration generate a new supervertices graph +* cluster_ptr: cluster assignment +* cluster_sum_vec_ptr: sum of clusters +* k_vec_ptr: ki vector +* output: +* delta_modularity_vec: size = n_edges +* delta modularity if we move from_node to to_nodes cluster c 
for each edge +* +*****************************************************************************************************/ +/* +template +__global__ void// __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +build_delta_modularity_vec(const int n_vertex, const int c_size, ValType m2, bool updated, + IdxIter csr_ptr_ptr, IdxIter csr_ind_ptr, ValIter csr_val_ptr, + IdxIter cluster_ptr, + ValType* cluster_sum_vec_ptr, + ValType* k_vec_ptr, + ValType* delta_modularity_vec){ + + ValType m2_s(m2); + + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + int start, end; + if( i < n_vertex ){ + + start = *(csr_ptr_ptr + i); + end = *(csr_ptr_ptr + i + 1); + + if(j < end - start){ + int j_idx = *(csr_ind_ptr + start + j); + int c = *( cluster_ptr + j_idx); + ValType c_sum = cluster_sum_vec_ptr[c]; + + delta_modularity( n_vertex, c_size, updated, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, + c_sum, m2_s, + i, start + j, c, k_vec_ptr, delta_modularity_vec + start + j); + + } + } +} +*/ + +/****************************************************** +* +* find the max delta modularity for each vertex i +* zero out other delta modularity for vertex i +* +*******************************************************/ +//template +template +__global__ void// __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +max_delta_modularity_vec_stride(const int n_vertex, const int n_edges, + IdxIter csr_ptr_iter, IdxIter csr_ind_iter, ValIter csr_val_iter, IdxIter cluster_iter, + ValType* delta_modularity_vec){ + + unsigned int wid = blockIdx.x; // 0 ~ n_vertex - 1 + unsigned int tid = threadIdx.x; // 0 ~ 31 + + __shared__ int start_idx; + __shared__ int end_idx; + __shared__ int degree; + __shared__ ValType local_max[WARP_SIZE]; + __shared__ ValType warp_max_val; + unsigned int stride = WARP_SIZE / 2; + warp_max_val = -1000; + + if( wid < n_vertex ){ + if(tid == 0){ + start_idx = *(csr_ptr_iter + wid); + end_idx = *(csr_ptr_iter + wid + 1); + degree = end_idx - start_idx; + } + __syncwarp(); + //find the max elements + for(unsigned xid = 0; xid + tid < ( degree ); xid += WARP_SIZE){ + local_max[tid]= -1.0 ; + + if(start_idx + xid + tid > n_edges) + printf("Error access invalid memory %d = %d + %d + %d end: %d\n", start_idx + xid + tid, start_idx, xid, tid, end_idx); + + local_max[tid] = (ValType)(*(delta_modularity_vec + start_idx + xid + tid)); + + stride = umin(16, (degree)/2 + 1); + + while(tid < stride && stride > 0){ + local_max[tid] = fmax(local_max[tid], local_max[tid + stride]); + + stride/=2; //stride /=2 + } + __syncwarp(); + + if(tid == 0 && warp_max_val < local_max[0]){ + warp_max_val = local_max[0]; + } + } + + __syncwarp(); + // zero out non-max elements + for(unsigned xid = 0; xid + tid < ( degree ); xid += WARP_SIZE){ + if(start_idx + xid + tid < end_idx){ + ValType original_val = ((ValType)*(delta_modularity_vec + start_idx + xid + tid)); + (*(delta_modularity_vec + start_idx + xid + tid)) = (int)(original_val == warp_max_val) * original_val; + +/* + if(original_val == warp_max_val){ + int j_idx = (int)(*(csr_ind_iter + start_idx + xid + tid)); + printf("+i: %d j: %d c: %d %f\n", wid, j_idx, (int)(*(cluster_iter + j_idx)),original_val ); + }else{ + int j_idx = (int)(*(csr_ind_iter + start_idx + xid + tid)); + printf("-i: %d j: %d c: %d %f\n", wid, j_idx, (int)(*(cluster_iter + j_idx)),original_val ); + + } + */ + + } + } + + + } + +} + + +/****************************************************** +* NOT USED +* find the max delta modularity for each vertex i +* zero out 
other delta modularity for vertex i +* +*******************************************************/ +/* +template +__global__ void// __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +max_delta_modularity_vec(const int n_vertex, + IdxIter csr_ptr_ptr, IdxIter csr_ind_ptr, ValIter csr_val_ptr, + ValType* delta_modularity_vec){ + + int i = blockIdx.x * blockDim.x + threadIdx.x; + int start, end; + ValType * best_pos_ptr; + if( i < n_vertex ){ + start = *( csr_ptr_ptr + i); + end = *( csr_ptr_ptr + i + 1); + best_pos_ptr = thrust::max_element(thrust::cuda::par, delta_modularity_vec + start, delta_modularity_vec + end); + } + + if( i < n_vertex ){ + //printf("i: %d max: %f\n", i, (ValType)(*best_pos_ptr)); + thrust::replace_if(thrust::cuda::par, delta_modularity_vec + start, delta_modularity_vec + end, not_best(*best_pos_ptr), 0.0); + + } + +} + +*/ +// Not used +template +void build_delta_modularity_vector_old(const int n_vertex, const int c_size, ValType m2, bool updated, + thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, thrust::device_vector& csr_val_d, + thrust::device_vector& cluster_d, + IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, // precompute cluster inverse + ValType* k_vec_ptr, // precompute ki's + thrust::device_vector& temp_vec, // temp global memory with size n_vertex + ValType* cluster_sum_vec_ptr, + ValType* delta_Q_arr_ptr){ + + /* start compute delta modularity vec */ + dim3 block_size_1d((n_vertex + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size_1d(BLOCK_SIZE_1D, 1, 1); + int n_edges = csr_ptr_d[n_vertex]; + + kernel_compute_cluster_sum<<>>( n_vertex, c_size, + cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + k_vec_ptr, cluster_sum_vec_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + + thrust::fill(thrust::cuda::par, delta_Q_arr_ptr, delta_Q_arr_ptr + n_edges, 0.0); + + //pre-compute max_degree for block_size_2D and grid_size_2D + thrust::transform(thrust::device, csr_ptr_d.begin() + 1, csr_ptr_d.end(), csr_ptr_d.begin(), temp_vec.begin(), minus_idx()); + auto max_ptr = thrust::max_element(thrust::device, temp_vec.begin(), temp_vec.begin() + n_vertex ); + int max_degree = (IdxType)(*max_ptr); + + dim3 block_size_2d((n_vertex + BLOCK_SIZE_2D*2 -1)/ (BLOCK_SIZE_2D*2), (max_degree + BLOCK_SIZE_2D -1)/ (BLOCK_SIZE_2D), 1); + dim3 grid_size_2d(BLOCK_SIZE_2D*2, BLOCK_SIZE_2D, 1); + + // build delta modularity vec with 2D (vertex i, neighbor of i) grid size are_now(32, 16, 1) + build_delta_modularity_vec<<>>(n_vertex, c_size, m2, updated, + csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), + cluster_d.begin(), + cluster_sum_vec_ptr, + k_vec_ptr, delta_Q_arr_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + + + block_size_1d = dim3((n_vertex + BLOCK_SIZE_1D*4 -1)/ BLOCK_SIZE_1D*4, 1, 1); + grid_size_1d = dim3(BLOCK_SIZE_1D*4, 1, 1); + + // zero out non maximum delta modularity for each vertex i grid size are now (128, 1, 1) + max_delta_modularity_vec<<>>(n_vertex, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), delta_Q_arr_ptr ); + CUDA_CALL(cudaDeviceSynchronize()); + +} + + + +// +// A new version of building delta modularity vector function +// +// +template +void build_delta_modularity_vector(cusparseHandle_t cusp_handle, const int n_vertex, const int c_size, ValType m2, bool updated, + thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, thrust::device_vector& csr_val_d, + thrust::device_vector& cluster_d, + IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, // precompute cluster inverse + ValType* k_vec_ptr, // 
precompute ki's + ValType* cluster_sum_vec_ptr, + ValType* delta_Q_arr_ptr){ + + /* start compute delta modularity vec */ + dim3 block_size_1d((n_vertex + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size_1d(BLOCK_SIZE_1D, 1, 1); + int n_edges = csr_ptr_d[n_vertex]; + + kernel_compute_cluster_sum<<>>( n_vertex, c_size, + cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + k_vec_ptr, cluster_sum_vec_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + + thrust::fill(thrust::cuda::par, delta_Q_arr_ptr, delta_Q_arr_ptr + n_edges, 0.0); + IdxType *csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + IdxType *csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); + ValType *csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + IdxType *cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); + + // pre compute coo row indices using cusparse + thrust::device_vector coo_row_ind(n_edges); + IdxType* coo_row_ind_ptr = thrust::raw_pointer_cast(coo_row_ind.data()); + cusparseXcsr2coo(cusp_handle, csr_ptr_ptr, + n_edges, n_vertex, coo_row_ind_ptr, + CUSPARSE_INDEX_BASE_ZERO); + // build delta modularity vec flatten (1 thread per 1 edges) + block_size_1d = dim3((n_edges + BLOCK_SIZE_1D * 2 -1)/ BLOCK_SIZE_1D * 2, 1, 1); + grid_size_1d = dim3(BLOCK_SIZE_1D*2, 1, 1); + + build_delta_modularity_vec_flat<<>>(n_vertex, n_edges, c_size, m2, updated, + coo_row_ind_ptr, csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, + cluster_sum_vec_ptr, + k_vec_ptr, delta_Q_arr_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + + // Done compute delta modularity vec + block_size_1d = dim3(n_vertex, 1, 1); + grid_size_1d = dim3(WARP_SIZE, 1, 1); + + max_delta_modularity_vec_stride<<>>(n_vertex, n_edges, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), cluster_d.begin(), delta_Q_arr_ptr ); + CUDA_CALL(cudaDeviceSynchronize()); + + +} + + + +} // nvlouvain diff --git a/cpp/nvgraph/cpp/include/exclusive_kv_scan.hxx b/cpp/nvgraph/cpp/include/exclusive_kv_scan.hxx new file mode 100644 index 00000000000..a180fbc4915 --- /dev/null +++ b/cpp/nvgraph/cpp/include/exclusive_kv_scan.hxx @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once +#include "shfl.hxx" +#include "sm_utils.h" + +namespace nvgraph +{ + //This file is to do a blockwide reduction by key as specialized for Key-Value Pairs. +//Each thread will call this function. There will be two outputs. 
One will be the calling thread's +//own output key value pair and the other will be the block-wide aggegrate reduction of the input items +//This is based on Duane Merrills's Exclusive Scan function in Cub + +//Implementing key value pair to be called in device functions +template //allow for different datatypes +struct KeyValuePair +{ + IndexType_ key; + ValueType_ value; +}; + +//binary reduction operator to be applied to the values- we can template on the type on +//the operator for the general case but only using sum () in our case so can simplify +template +struct ReduceByKeySum +{ + SemiRingType_ SR; + __host__ __device__ __forceinline__ ReduceByKeySum(SemiRingType_ SR) : SR(SR) //pass in semiring + { + + } + template + __host__ __device__ __forceinline__ KeyValuePair + operator() (const KeyValuePair &first, + const KeyValuePair &second) + { + KeyValuePair result = second; + //check if they have matching keys and if so sum them + if (first.key == second.key) + result.value = SR.plus(first.value, result.value); + return result; + } +}; +//Statically determien log2(N), rounded up +template +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +template +struct Log2 +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; + +template +struct PrefixSum +{ + int laneId, warpId, linearTid; + SemiRingType_ SR; + //list constants + enum + { + //number of threads per warp + WARP_THREADS = 32, + // The number of warp scan steps log2 + STEPS = Log2::VALUE, + // The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = ((-1 << STEPS) & 31) << 8, + //add in more enums for the warps! + //calculate the thread block size in threads + BLOCK_DIM_Y = 1, + BLOCK_DIM_Z = 1, + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + //calculate the number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + //constructor + __device__ __forceinline__ PrefixSum(SemiRingType_ SR) : SR(SR) + { + laneId = utils::lane_id(); //set lane id + linearTid = threadIdx.x; //simple for linear 1D block + warpId = (WARPS == 1) ? 
0 : linearTid / WARP_THREADS; + } + + //Final function with the exclusive scan outputs one partial sum for the calling thread and the blockwide reduction + __device__ __forceinline__ void ExclusiveKeyValueScan( + KeyValuePair &output, //input/output key value pair from the calling thread + KeyValuePair &blockAggegrate) //blockwide reduction output + { + KeyValuePair inclusiveOutput; + KeyValueScan(inclusiveOutput, output); //to get individual thread res + CalcBlockAggregate(output, inclusiveOutput, blockAggegrate, (laneId > 0)); //to get blockwide res + } + + //This function uses the inclusive scan below to calculate the exclusive scan + __device__ __forceinline__ void KeyValueScan( + KeyValuePair &inclusiveOutput, //calling thread's inclusive-scan output item + KeyValuePair &exclusiveOutput) //calling thread's exclusive-scan output item + { //exclusiveOutput is the initial input as well + InclusiveKeyValueScan(exclusiveOutput, inclusiveOutput); //inclusive starts at first number and last element is total reduction + //to get exclusive output shuffle the keys and values both up by 1 + exclusiveOutput.key = utils::shfl_up(inclusiveOutput.key, 1); + exclusiveOutput.value = utils::shfl_up(inclusiveOutput.value, 1); + } + + //This function computes an inclusive scan odf key value pairs + __device__ __forceinline__ void InclusiveKeyValueScan( + KeyValuePair input, //calling thread's input item + KeyValuePair &output //calling thread's input item + ) + { + //__shfl_up and __ballot are intrinsic functions require SM30 or greater-send error message for lower hardwares + output = input; + IndexType_ predKey = utils::shfl_up(output.key, 1); //shuffle key to next neighbor + unsigned int ballot = utils::ballot((predKey != output.key));//intrinsic evaluates a condition for all threads in the warp and returns a 32-bit value + //where each bit gives the condition for the corresponding thread in the warp. + + //Mask away all lanes greater than ours + ballot = ballot & utils::lane_mask_le(); + + //Find index of first set bit + int firstLane = max(0, 31 - __clz(ballot));//Count the number of consecutive leading zero bits, + //starting at the most significant bit (bit 31) of x. //Returns a value between 0 and 32 inclusive representing the number of zero bits. + //Iterate scan steps + for (int step = 0; step < STEPS; ++step) //only called on double not key so not specific to key value pairs + { + output.value = SR.shflPlus(output.value, firstLane | SHFL_C, 1 << step); //plus defined on class operator + //if (threadIdx.x + blockDim.x *blockIdx.x < 4)printf("%.1f\n", output.value); + } + } + + //This completes the warp-prefix scan. Now we will use the Warp Aggregates to also calculate a blockwide aggregate + // Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. 
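The exclusive key-value scan above is built from a shuffle-based inclusive warp scan that is then shifted up by one lane. For reference, here is a minimal standalone sketch of the unsegmented version of that warp scan using the newer `__shfl_up_sync` intrinsic (the header additionally restarts the scan at key boundaries via ballot and lane-mask logic, which is not shown); the kernel and buffer names are illustrative only.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Plain warp-wide inclusive sum scan built on shuffles (Kogge-Stone style).
__device__ float warp_inclusive_sum(float x)
{
    const int lane = threadIdx.x & 31;
    for (int step = 1; step < 32; step <<= 1) {
        float y = __shfl_up_sync(0xffffffffu, x, step);
        if (lane >= step)           // lanes below 'step' have no left neighbour
            x += y;
    }
    return x;
}

__global__ void scan_one_warp(float* out)
{
    out[threadIdx.x] = warp_inclusive_sum(1.0f);   // expect 1,2,3,...,32
}

int main()
{
    float* d_out;
    cudaMalloc(&d_out, 32 * sizeof(float));
    scan_one_warp<<<1, 32>>>(d_out);

    float h_out[32];
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; ++i) std::printf("%g ", h_out[i]);
    std::printf("\n");
    cudaFree(d_out);
    return 0;
}
```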
+ //Also returns block-wide aggregate + __device__ __forceinline__ void CalcBlockAggregate( //can add in scan operators later + KeyValuePair &partial, //Calling thread's partial reduction + KeyValuePair warpAggregate, //Warp-wide aggregate reduction of input items + KeyValuePair &blockAggregate, //Threadblock-wide aggregate reduction of input items + bool laneValid = true) //Whether or not the partial belonging to the current thread is valid + { + //use shared memory in the block approach + // Last lane in each warp shares its warp-aggregate + //use 1D linear linear_tid def + __shared__ KeyValuePair warpAggregates[WARPS]; + if (laneId == WARP_THREADS - 1) //number of threads per warp + warpAggregates[warpId] = warpAggregate; + //load into shared memory and wait until all threads are done + __syncthreads(); + + blockAggregate = warpAggregates[0]; + ReduceByKeySum keyValAdd(SR); //call scn operator only add together if keys match + for (int warp = 1; warp < WARPS; ++warp) + { + KeyValuePair inclusive = keyValAdd(blockAggregate, partial); + if (warpId == warp) + partial = (laneValid) ? inclusive : blockAggregate; + + KeyValuePair addend = warpAggregates[warp]; + blockAggregate = keyValAdd(blockAggregate, addend); //only add if matching keys + } + } +}; + +} //end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/functor.cuh b/cpp/nvgraph/cpp/include/functor.cuh new file mode 100644 index 00000000000..a0e08425090 --- /dev/null +++ b/cpp/nvgraph/cpp/include/functor.cuh @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include + + +namespace nvlouvain{ + +template +struct link_to_cluster{ + + IdxType key; + IdxIter cluster_iter; + __host__ __device__ + link_to_cluster(IdxType _key, IdxIter _iter): key(_key), cluster_iter(_iter){} + + __host__ __device__ + bool operator()(const IdxType& csr_idx){ + return ((*(cluster_iter + csr_idx)) == key); + } +}; + +template +struct link_inside_cluster{ + + IdxType idx_i; + IdxType key; + IdxIter cluster_iter; + __host__ __device__ + link_inside_cluster(IdxType _idx_i, IdxType _key, IdxIter _iter):idx_i(_idx_i), key(_key), cluster_iter(_iter){} + + __host__ __device__ + bool operator()(const IdxType& csr_idx){ + return ((*(cluster_iter + csr_idx)) == (*(cluster_iter + idx_i))) && ((*(cluster_iter + csr_idx)) == key); + } +}; + +template +struct link_incident_cluster{ + + IdxType key; + IdxIter cluster_iter; + IdxType i; + __host__ __device__ + link_incident_cluster(IdxType _key, IdxIter _iter, IdxType _i): key(_key), cluster_iter(_iter), i(_i){} + + __host__ __device__ + bool operator()(const IdxType& csr_idx){ + //if(csr_idx == i) return false; + return (csr_idx == i) ? 
false : ((key) == (IdxType)(*(cluster_iter + csr_idx)) ); + } +}; + +template +struct ci_not_equal_cj{ + + IdxType key; + IdxIter cluster_iter; + __host__ __device__ + ci_not_equal_cj( IdxType _key, IdxIter _iter): key(_key), cluster_iter(_iter){} + + __host__ __device__ + bool operator()(const IdxType& idx){ + IdxType cj = *(cluster_iter+idx); + + return (cj != key); + } +}; + +template +struct ci_is_cj{ + + IdxType key; + IdxIter cluster_iter; + __host__ __device__ + ci_is_cj( IdxType _key, IdxIter _iter): key(_key), cluster_iter(_iter){} + + __host__ __device__ + bool operator()(const IdxType& idx){ + IdxType cj = *(cluster_iter+idx); + + return (cj == key); + } +}; + + +template +struct rand_functor{ + IdxType low; + IdxType up; + + __host__ __device__ + rand_functor(IdxType _low, IdxType _up): low(_low), up(_up){} + + __host__ __device__ + bool operator()(const IdxType& idx){ + thrust::random::default_random_engine rand_eng; + thrust::random::uniform_int_distribution< IdxType > random_op(low, up); + rand_eng.discard(idx); + return random_op(rand_eng); + + } +}; + +template +struct not_zero{ + __host__ __device__ + bool operator()(const IdxType& idx){ + return (idx != 0); + + } +}; + +template +struct is_one{ + __host__ __device__ + bool operator()(const IdxType& x){ + return x == 1; + } +}; + +template +struct is_c{ + IdxType c; + __host__ __device__ + is_c(int _c):c(_c){} + + __host__ __device__ + bool operator()(const IdxType& x){ + return x == c; + } +}; + + +template +struct not_best{ + ValType best_val; + __host__ __device__ + not_best(ValType _b):best_val(_b){} + __host__ __device__ + bool operator()(const ValType& val){ + return (val != best_val); + } +}; + +template +struct assign_k_functor{ + ValType* k_ptr; + __host__ __device__ + assign_k_functor(ValType* _k):k_ptr(_k){} + + template + __host__ __device__ + void operator()(Tuple t){ + //output[i] = k_ptr[ ind[i] ]; + thrust::get<1>(t) = *(k_ptr + thrust::get<0>(t)); + // t.first = *(k_ptr + t.second); + } +}; + +template +struct assign_table_functor{ + IdxType* table_array; + IdxIter cluster_iter; + __host__ __device__ + assign_table_functor(IdxIter _c, IdxType* _t):cluster_iter(_c),table_array(_t){} + + template + __host__ __device__ + void operator()(Tuple t){ + //output[i] = k_ptr[ ind[i] ]; +// thrust::get<1>(t) = *(k_ptr + thrust::get<0>(t)); + table_array[*(cluster_iter + thrust::get<0>(t))] = 1; + // t.first = *(k_ptr + t.second); + } +}; + + +template +struct minus_idx{ + + __host__ __device__ + ValType operator()(const IdxType & x, const IdxType & y) const{ + return (ValType) (x - y); + } +}; + +template +struct sort_by_cluster{ + IdxIter cluster_iter; + __host__ __device__ + sort_by_cluster(IdxIter _c):cluster_iter(_c){} + + __host__ __device__ + bool operator()(const IdxType& a, const IdxType& b){ + return (IdxType)(*(cluster_iter + a)) < (IdxType)(*(cluster_iter + b)); + } + +}; + + +template +__device__ inline IdxType not_delta_function(IdxType c1, IdxType c2){ + return (IdxType)(c1!=c2); +} + + +template +__device__ inline IdxType delta_function(IdxType c1, IdxType c2){ + return (IdxType)(c1==c2); +} + + +}// nvlouvain diff --git a/cpp/nvgraph/cpp/include/graph.hxx b/cpp/nvgraph/cpp/include/graph.hxx new file mode 100644 index 00000000000..0406740ff2c --- /dev/null +++ b/cpp/nvgraph/cpp/include/graph.hxx @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include // size_t +#include + +#include // +// +namespace nvgraph +{ + +#define DEFINE_VISITABLE(T) \ +virtual void Accept(VisitorBase& guest) \ +{ BaseVisitableGraph::AcceptImpl(*this, guest); } + +template +struct BaseVisitableGraph +{ + virtual void Accept(VisitorBase& v) = 0; + + virtual ~BaseVisitableGraph(void) + { + } +protected: + template + static void AcceptImpl(Host& visited, VisitorBase& guest) + { + if( Visitor* p = dynamic_cast*>(&guest)) + { + p->Visit(visited); + } + } +}; + +template +class Graph: public BaseVisitableGraph +{ +public: + typedef IndexType_ IndexType; + +protected: + size_t num_vertices; + size_t num_edges; + Graph *parent; + Graph *child; + +public: + /*! Construct an empty \p Graph. + */ + Graph() + : num_vertices(0),num_edges(0) {} + + /*! Construct a \p Graph with a specific number of vertices. + * + * \param vertices Number of vertices. + */ + Graph(size_t vertices) + : num_vertices(vertices), num_edges(0) {} + + /*! Construct a \p Graph with a specific number of vertices and edges. + * + * \param vertices Number of vertices. + * \param edges Number of edges. + */ + Graph(size_t vertices, size_t edges) + : num_vertices(vertices), num_edges(edges) {} + + /*! Construct a \p CsrGraph from another graph. + * + * \param CsrGraph Another graph in csr + */ + Graph(const Graph& gr) + { + num_vertices = gr.get_num_vertices(); + num_edges = gr.get_num_edges(); + } + + inline void set_num_vertices(IndexType_ p_num_vertices) { num_vertices = p_num_vertices; } + inline void set_num_edges(IndexType_ p_num_edges) { num_edges = p_num_edges; } + inline size_t get_num_vertices() const { return num_vertices; } + inline size_t get_num_edges() const { return num_edges; } + /*! Resize graph dimensions + * + * \param num_rows Number of vertices. + * \param num_cols Number of edges. + */ + //inline void resize(size_t vertices, size_t edges) + //{ + // num_vertices = vertices; + // num_edges = edges; + //} + + //Accept method injection + DEFINE_VISITABLE(IndexType_) +}; + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx b/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx new file mode 100644 index 00000000000..279a21ed61e --- /dev/null +++ b/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx @@ -0,0 +1,1414 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPH_CONCRETE_VISITORS_HXX +#define GRAPH_CONCRETE_VISITORS_HXX + +#include //which includes all other headers... 
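The `DEFINE_VISITABLE` macro and `BaseVisitableGraph::AcceptImpl` above implement Alexandrescu's acyclic visitor: a visitor only handles the node types for which it derives `Visitor<T>`, and `Accept` probes for that capability with `dynamic_cast`. A toy reconstruction of that double dispatch follows; the class names (`Node`, `CsrNode`, `PrintVisitor`) are made up for illustration and are not part of the library.

```cpp
#include <iostream>

struct VisitorBase { virtual ~VisitorBase() = default; };

template <typename Visited>
struct Visitor { virtual void Visit(Visited&) = 0; };

struct Node
{
    virtual ~Node() = default;
    virtual void Accept(VisitorBase& guest) = 0;

protected:
    template <typename Host>
    static void AcceptImpl(Host& visited, VisitorBase& guest)
    {
        // Side-cast: succeeds only if the concrete visitor derives Visitor<Host>.
        if (auto* p = dynamic_cast<Visitor<Host>*>(&guest))
            p->Visit(visited);
        // else: this visitor does not know the type -> silently ignored
    }
};

struct CsrNode : Node
{
    void Accept(VisitorBase& guest) override { AcceptImpl(*this, guest); }
};

struct PrintVisitor : VisitorBase, Visitor<CsrNode>
{
    void Visit(CsrNode&) override { std::cout << "visiting CsrNode\n"; }
};

int main()
{
    CsrNode n;
    PrintVisitor v;
    n.Accept(v);     // dispatches to PrintVisitor::Visit(CsrNode&)
    return 0;
}
```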
+#include // TODO: to be changed to thrust/range_view.h, when toolkit gets in sync with Thrust +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // +#include // +#include // +#include // +#include // +#include +#include +#include +#include + +namespace nvgraph +{ + //get unique elements and return their count: + // + template + size_t count_get_distinct(const Container& v, //in + Container& res) //out + { + res.assign(v.begin(), v.end());//copy + + size_t counts = thrust::distance(res.begin(), thrust::unique(res.begin(), res.end())); + res.resize(counts); + return counts; + } + + //Adapted from: https://github.com/thrust/thrust/blob/master/examples/expand.cu + // + //Note: + //C++03 doesn’t allow default template arguments on function templates. + //This was considered a “defect” by Bjarne Stroustrup, subsequently fixed in C++11. + //See, for example: http://stackoverflow.com/questions/2447458/default-template-arguments-for-function-templates + // + template class Allocator, + template class Vector> + typename Vector >::iterator expand(Vector >& counts, + Vector >& values, + Vector >& out) + { + typedef typename Vector >::iterator Iterator; + + Iterator first1 = counts.begin(); + Iterator last1 = counts.end(); + + Iterator first2 = values.begin(); + Iterator output = out.begin(); + + typedef typename thrust::iterator_difference::type difference_type; + + difference_type input_size = thrust::distance(first1, last1); + difference_type output_size = thrust::reduce(first1, last1); + + // scan the counts to obtain output offsets for each input element + Vector > output_offsets(input_size, 0); + thrust::exclusive_scan(first1, last1, output_offsets.begin()); + + // scatter the nonzero counts into their corresponding output positions + Vector > output_indices(output_size, 0); + thrust::scatter_if + (thrust::counting_iterator(0), + thrust::counting_iterator(input_size), + output_offsets.begin(), + first1, + output_indices.begin()); + + // compute max-scan over the output indices, filling in the holes + thrust::inclusive_scan + (output_indices.begin(), + output_indices.end(), + output_indices.begin(), + thrust::maximum()); + + // gather input values according to index array (output = first2[output_indices]) + Iterator output_end = output; thrust::advance(output_end, output_size); + thrust::gather(output_indices.begin(), + output_indices.end(), + first2, + output); + + // return output + output_size + thrust::advance(output, output_size); + return output; + } + + + + // + // + + + + //##### Change 1: reverse hash was wrong: hash[val_i] = index of first occurence of val_i ##### + // + template + struct MinLeftRightPlusValue + { + typedef typename VectorPtrT::PtrT PtrT; + typedef typename Container::value_type ValT; + + explicit MinLeftRightPlusValue(ValT delta): + delta_(delta) + { + } + + __host__ __device__ + ValT operator() (ValT left, ValT right) + { + ValT rs = right + delta_; + return (left < rs? left : rs); + } + + private: + ValT delta_; + }; + + //given vector v[i] = val_i, + //return reverse hash vector: + //hash[val_i] = i (index of first occurence of val_i, if val_i exists in v[]; + // else, last occurence of closest value less than val_i): + // + //advantage: works trully like a hash, no need for search + // + // + //pre-conditions: (1.) v sorted in ascending order + // (2.) 
value_type is integer type + // + //Ex: + //v: 0,1,3,6,7,8,8; + //hash: 0,1,1,2,2,2,3,4,5; + // + template + void reverse_hash(Container& v, //in + Container& hash) //out + { + typedef typename Container::value_type ValT; + + if( v.empty() ) + return; + + size_t sz = v.size(); + size_t seq_sz = v.back()-v.front()+1; + + thrust::counting_iterator seq_first(v.front()); + thrust::counting_iterator seq_last(v.back()+1); + + Container hash1(seq_sz, ValT(-1)); + Container hash2(seq_sz, ValT(-1)); + hash.assign(seq_sz, ValT(-1)); + + thrust::upper_bound(v.begin(), v.end(), + seq_first, seq_last, //seq.begin(), seq.end(),//ok + hash1.begin(), + thrust::less()); + + // + thrust::lower_bound(v.begin(), v.end(), + seq_first, seq_last, //seq.begin(), seq.end(), //ok + hash2.begin(), + thrust::less()); + + thrust::transform(hash2.begin(), hash2.end(), + hash1.begin(), + hash.begin(), + MinLeftRightPlusValue(-1)); + + } + + //better use thrust::gather(...) + //see /home/aschaffer/Development/Sources/Cuda_Thrust/filter_via_gather.cu + template + struct Filter + { + typedef typename VectorR::value_type RetT; + + explicit Filter(VectorR& src): + m_src(&src[0]) + { + } + __host__ __device__ + RetT operator()(const IndexT& k) + { + return m_src[k]; + } + private: + typename VectorPtrT::PtrT m_src; + }; + + template + struct CleanFctr + { + explicit CleanFctr(Container& used): + m_used(&used[0]) + { + } + __host__ __device__ + bool operator()(const IndexT& k) + { + return (m_used[k] == 0); + } + private: + typename VectorPtrT::PtrT m_used; + }; + + // + // + template + struct ValueUpdater + { + typedef typename VectorI::value_type IndexT; + //typedef typename VectorPtrT::PtrT PtrI; + + typedef typename VectorV::value_type ValueT; + typedef typename VectorPtrT::PtrT PtrV; + + explicit ValueUpdater(VectorV& v_src, + VectorV& v_dest): + v_s_(v_src), + v_d_(v_dest) + { + } + + ///__host__ __device__ + ValueT at(IndexT j) const + { + return v_s_[j]; + } + + struct ValFiller + { + explicit ValFiller(VectorV& v_src): + m_s(&v_src[0]) + { + } + + __host__ __device__ + ValueT operator() (IndexT k) + { + return m_s[k]; + } + private: + PtrV m_s; + }; + + //##### Change 5: const K ##### + // + void update_from(const VectorI& K) + { + size_t actual_nnz = K.size(); + + v_d_.assign(actual_nnz, ValueT(0)); + + ValFiller valfill(v_s_); + thrust::transform(K.begin(), K.end(), + v_d_.begin(), + valfill); + } + + const VectorV& get_subg_vals(void) const + { + return v_d_; + } + private: + VectorV& v_s_; + VectorV& v_d_; + }; + + template + struct Offsets2RowIndex + { + + typedef typename VectorI::value_type IndexT; + typedef typename VectorPtrT::PtrT PtrB; + typedef typename VectorPtrT::PtrT PtrI; + + + Offsets2RowIndex(VectorB& hash_rows, + VectorI& offsets, + VectorI& I0, + VectorI& vSub, + VectorI& row_ptr, + VectorI& col_ind, + VectorI& I, + VectorI& J, + VectorI& K, + VectorB& U): + m_hash_sz(hash_rows.size()), + m_off_sz(offsets.size()), + m_hash_rows(&hash_rows[0]), + m_offsets(&offsets[0]), + m_i0(&I0[0]), + m_row_subset(&vSub[0]), + m_row_ptr(&row_ptr[0]), + m_col_ind(&col_ind[0]), + m_i(&I[0]), + m_j(&J[0]), + m_k(&K[0]), + m_used(&U[0]) + { + } + + + + //k = element in range[]:{0,1,...,nnz-1} + // + __host__ __device__ + IndexT operator() (IndexT k) + { + IndexT subg_row_index = m_i0[k]; + + IndexT g_row_index = m_row_subset[subg_row_index]; + + //j = col_ind[ row_ptr[g_row_index] + k - offsets[subg_row_index]] + // + IndexT row_ptr_i = m_row_ptr[g_row_index]+ + k- + m_offsets[subg_row_index]; + + IndexT col_index = 
m_col_ind[row_ptr_i]; + + //is col_index in row_subset? + // + if( (col_index < m_hash_sz) && (m_hash_rows[col_index] == 1) ) + //col_index in subset, too=>it's a hit! + { + m_i[k] = g_row_index; + m_j[k] = col_index; + + ///m_v[k] = m_fctr.at(row_ptr_i);//ok, but couples it with vals... + m_k[k] = row_ptr_i; + + m_used[k] = 1; + } + //else ...nothing + + return g_row_index; + } + private: + const size_t m_hash_sz; + const size_t m_off_sz; + + PtrB m_hash_rows; + + PtrI m_offsets; + + PtrI m_offset_indices; + + PtrI m_row_subset; + + PtrI m_row_ptr; + + PtrI m_col_ind; + + PtrI m_i0; + + PtrI m_i; + + PtrI m_j; + + PtrI m_k; + + PtrB m_used; + }; + + template + size_t fill_hash_nz2ijv(VectorB& hash_rows, + VectorI& range, //in/out + VectorI& nzs, + VectorI& offsets, + VectorI& vSub, + VectorI& row_ptr, + VectorI& col_ind, + VectorI& I, + VectorI& J, + VectorI& K, + VectorB& U) + { + typedef typename VectorI::value_type IndexT; + + size_t nnz = range.size(); + size_t nrows_subg = nzs.size(); + + VectorI I0(nnz, IndexT(0)); + VectorI dummy(nnz, IndexT(0)); + + //make m_offset_indices increasing sequence + //from 0,...,offsets.size()-1 + // + VectorI offset_indices(nrows_subg, IndexT(0)); + thrust::sequence(offset_indices.begin(), + offset_indices.end(), + IndexT(0)); + + expand(nzs, offset_indices, I0); + + Offsets2RowIndex + off_fctr(hash_rows, + offsets, + I0, + vSub, + row_ptr, + col_ind, + I,J,K,U); + + //why unused dummy? + //because functor must return something + //and must store result of functor somewhere! + // + thrust::transform(range.begin(), range.end(), + dummy.begin(), //unused... + off_fctr); + + CleanFctr cleaner(U); + range.erase(thrust::remove_if(range.begin(), range.end(), cleaner), range.end()); + + size_t actual_nnz = range.size(); + + VectorI truncated_i(actual_nnz, IndexT(0)); + VectorI truncated_j(actual_nnz, IndexT(0)); + ///VectorV truncated_v(actual_nnz, IndexT(0)); + VectorI truncated_k(actual_nnz, IndexT(0)); + + Filter filter_i(I); + thrust::transform(range.begin(), range.end(), + truncated_i.begin(), + filter_i); + I = truncated_i; // vector copy! + + Filter filter_j(J); + thrust::transform(range.begin(), range.end(), + truncated_j.begin(), + filter_j); + J = truncated_j; // vector copy! + + Filter filter_k(K); + thrust::transform(range.begin(), range.end(), + truncated_k.begin(), + filter_k); + K = truncated_k; // vector copy! + + // Filter filter_v(V); + // thrust::transform(range.begin(), range.end(), + // truncated_v.begin(), + // filter_v); + // V = truncated_v; // vector copy! + + //scoo.m_v[] == subg.vals ! 
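`fill_hash_nz2ijv` over-allocates one slot per candidate edge and then compacts the surviving hits with `thrust::remove_if` driven by the `used` flags (the `CleanFctr` predicate). The sketch below replays just that compaction step with made-up data; the `is_unused` functor stands in for `CleanFctr` and is an assumption of this example.

```cpp
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <cstdio>

// Predicate reading a device "used" flag array: index k is removed when used[k] == 0.
struct is_unused
{
    const int* used;
    explicit is_unused(const int* u) : used(u) {}
    __host__ __device__ bool operator()(int k) const { return used[k] == 0; }
};

int main()
{
    const int nnz = 8;
    int h_used[nnz] = {1, 0, 1, 1, 0, 0, 1, 0};

    thrust::device_vector<int> used(h_used, h_used + nnz);
    thrust::device_vector<int> range(nnz);
    thrust::sequence(range.begin(), range.end());          // 0,1,...,nnz-1

    // Compact the candidate range down to the entries that were actually hit.
    auto new_end = thrust::remove_if(range.begin(), range.end(),
                                     is_unused(thrust::raw_pointer_cast(used.data())));
    range.erase(new_end, range.end());

    for (int i = 0; i < (int)range.size(); ++i)
        std::printf("%d ", (int)range[i]);                  // 0 2 3 6
    std::printf("\n");
    return 0;
}
```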
+ ///fctr.update_vals(scoo.get_v()); + + U.assign(actual_nnz,1);//just for consistency, + // not really necessary + + return actual_nnz; + } + + + template + struct NzCounter + { + typedef typename Container::value_type IndexT; + typedef typename VectorPtrT::PtrT PtrT; + + explicit NzCounter(Container& row_ptr): + m_row_ptr(&row_ptr[0]) + { + } + + __host__ __device__ + IndexT operator() (const IndexT& i) + { + return m_row_ptr[i+1]-m_row_ptr[i]; + } + private: + PtrT m_row_ptr; + }; + + template + struct HashFctr + { + typedef typename Container::value_type IndexT; + + explicit HashFctr(Container& hash_src): + m_hash(&hash_src[0]) + { + } + __host__ __device__ + IndexT operator() (const IndexT& src_elem) + { + IndexT hit(1); + m_hash[src_elem] = hit; + return hit; + } + private: + typename VectorPtrT::PtrT m_hash; + }; + + template + size_t make_hash(VectorI& src, + VectorB& hash_src, + bool is_sorted = false) + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorB::value_type ValueB; + + assert( !src.empty() ); + + IndexT max_entry(0); + if( is_sorted ) + max_entry = src.back(); + else + max_entry = thrust::reduce(src.begin(), src.end(), + 0, + thrust::maximum()); + + hash_src.assign(max_entry+1, 0); + VectorB dummy(hash_src); + + HashFctr hctr(hash_src); + + //why unused dummy? + //because functor must return something + //and must store result of functor somewhere! + // + thrust::transform(src.begin(), src.end(), + dummy.begin(), //unused... + hctr); + + return hash_src.size(); + } + + + //##### Change 2: subg row_ptr extraction failed on missing indices ##### + // + template + void make_subg_row_ptr(size_t actual_nnz, //in: # non-zeros in subgraph matrix + size_t nrows, //in: |vSub| + VectorI& source, //in: array of row indices where there + // are non-zeros (assumed sorted) + VectorI& subg_row_ptr) //out:subgraph row_ptr + { + typedef typename VectorI::value_type IndexT; + + if( actual_nnz == 0 ) + return;//nothing to do... + + IndexT start = source.front();//cannot call this on an empty source[] + + VectorI counts(nrows, 0); + + VectorI ub(nrows), lb(nrows); + thrust::upper_bound(source.begin(), source.end(), + thrust::make_counting_iterator(static_cast(start)), + thrust::make_counting_iterator(nrows+static_cast(start)), + ub.begin()); + + thrust::lower_bound(source.begin(), source.end(), + thrust::make_counting_iterator(static_cast(start)), + thrust::make_counting_iterator(nrows+static_cast(start)), + lb.begin()); + + thrust::transform(ub.begin(), ub.end(), lb.begin(), counts.begin(), thrust::minus()); + + thrust::exclusive_scan(counts.begin(), counts.end(), + subg_row_ptr.begin()); + + subg_row_ptr.back() = actual_nnz; + } + + //used by renumber_indices(...) + // + template + struct Hasher + { + typedef typename Container::value_type IndexT; + typedef typename VectorPtrT::PtrT PtrT; + + explicit Hasher(Container& hash_src): + m_hash(&hash_src[0]) + { + } + __host__ __device__ + IndexT operator() (IndexT i, IndexT v) + { + m_hash[v] = i; + return v; + } + + __host__ __device__ + IndexT operator() (IndexT u) + { + return m_hash[u]; + } + private: + PtrT m_hash; + }; + + //##### Change 3: index renumbering must be split into hash construction and hash usage ##### + //constructs hash table + //from set of indices into reduced set of indices: + //row_idx{5,7,10,12}->{0,1,2,3}; + // so that given u{12,7} you get: w{3,1} + //w[i]=hash[u[i]]; + // + //Pre-conditions: + //(1.) row_idx is sorted (increasing order); + //(2.) 
row_idx has no duplicates; + // + template + void renumber_indices(VectorI& row_idx, //in: subset of row indices; + // pre-conditions= + // {sorted (increasingly), no duplicates} + VectorI& hash_t) //out: renumbering hash table + { + typedef typename VectorI::value_type IndexT; + size_t n = row_idx.size(); + VectorI dummy(n,IndexT(0)); + + IndexT max_entry = row_idx.back();//...since row_idx is sorted increasingly + hash_t.assign(max_entry+1, -1); + + Hasher hasher(hash_t); + + thrust::counting_iterator first(0); + + thrust::transform(first, first+n, + row_idx.begin(), + dummy.begin(), + hasher); + } + + template + void get_renumbered_indices(VectorI& u, //in: in=subset of row_idx; + VectorI& hash_t, //in: renumbering hash table + VectorI& w) //out:renumbered: hash[u[i]] + { + typedef typename VectorI::value_type IndexT; + + Hasher hasher(hash_t); + + thrust::transform(u.begin(), u.end(), + w.begin(), + hasher); + } + + template + struct SubGraphExtractorFunctor + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + typedef typename VectorB::value_type ValueB; + + typedef typename VectorPtrT::PtrT PtrB; + typedef typename VectorPtrT::PtrT PtrI; + typedef typename VectorPtrT::PtrT PtrV; + + //constructor for edge subset: + //requires additional info: col_ind, row_ptr + // + //pre-conditions: (1.) eSub sorted in ascending order; + // (2.) eSub has no duplicates; + // + SubGraphExtractorFunctor(const VectorI& eSub, bool /*unused*/): + edgeSubset(eSub), + is_vertex_extraction(false) + { + } + + explicit SubGraphExtractorFunctor(const VectorI& vSubset): + vertexSubset(vSubset), + is_vertex_extraction(true) + { + //make sure vertexSubset_ is sorted increasingly: + ///sort_ifnot(vertexSubset); + + row_ptr_subg.assign(vSubset.size()+1, IndexT(0)); // can be pre-allocated + } + + + virtual ~SubGraphExtractorFunctor(void) + { + } + + const VectorV& get_vals(void) const + { + return vals_subg; + } + + VectorV& get_vals(void) + { + return vals_subg; + } + + const VectorI& get_row_ptr(void) const + { + return row_ptr_subg; + } + + const VectorI& get_col_ind(void) const + { + return col_ind_subg; + } + + struct NoValueUpdater + { + //##### Change 5: const K ##### + // + void update_from(const VectorI& K) + { + //no-op.... 
+ } + }; + + virtual void operator () (VectorI& row_ptr_, + VectorI& col_ind_) + { + NoValueUpdater no_op; + if( is_vertex_extraction ) + extract_subgraph_by_vertex(row_ptr_, col_ind_, no_op); + else + extract_subgraph_by_edge(row_ptr_, col_ind_, no_op); + } + + + virtual void operator () (VectorV& vals_, + VectorI& row_ptr_, + VectorI& col_ind_) + { + ValueUpdater fctrv(vals_, vals_subg); + if( is_vertex_extraction ) + extract_subgraph_by_vertex(row_ptr_, col_ind_, fctrv); + else + extract_subgraph_by_edge(row_ptr_, col_ind_, fctrv); + } + + IndexT get_subg_nnz(void) const + { + return row_ptr_subg.back(); + } + + const VectorI& get_I(void) const + { + return I; + } + + const VectorI& get_J(void) const + { + return J; + } + + const VectorI& get_K(void) const + { + return K; + } + + + const VectorI& get_hash_table(void) const + { + return hash_t; + } + + const VectorI& get_vertex_subset(void) const + { + return vertexSubset; + } + + + protected: + + template + void extract_subgraph_by_vertex(VectorI& row_ptr_, + VectorI& col_ind_, + ValUpdaterFctr fctrv) + { + typedef typename VectorI::value_type IndexT; + //typedef typename VectorV::value_type ValueT; + typedef typename VectorB::value_type ValueB; + + if( vertexSubset.empty() ) + return; //nothing to do + + //Pre-condition (new): vertexSubset sorted! + size_t nrows_subg = vertexSubset.size(); + + //step 1: subgraph *upper-bound* + //of #non-zeros per row: + VectorI nzs(nrows_subg, 0); + //count_nz_per_row(row_ptr_, vertexSubset, nzs); + NzCounter count_nzs(row_ptr_); + thrust::transform(vertexSubset.begin(), vertexSubset.end(), + nzs.begin(), + count_nzs); + + //step 2: offsets of where each + //subgraph row *could* have entries; + // + //TODO: change to an exclusive prefix scan! + // + VectorI offsets(nrows_subg, 0); + thrust::exclusive_scan(nzs.begin(), nzs.end(), + offsets.begin()); + + //step 3: total # non-zero entries; this is used as upper bound + //for # non-zero entries of subgraph; + // + size_t nnz = offsets.back()+nzs.back(); + + VectorI range(nnz, IndexT(0));//increasing sequence + thrust::sequence(range.begin(), range.end(),IndexT(0));//or, counting_iterator + + VectorB hash_rows; + size_t hash_sz = make_hash(vertexSubset, hash_rows, true); + + //step 4: create hash map between nz entry and corresponding + // I[], J[], V[], Used[] SoA; update vals_ + // + I.assign(nnz, IndexT(0)); + J.assign(nnz, IndexT(0)); + K.assign(nnz, IndexT(0)); + + VectorB U(nnz, ValueB(0)); + + size_t actual_nnz = fill_hash_nz2ijv(hash_rows, + range, + nzs, + offsets, + vertexSubset, + row_ptr_, + col_ind_, + I, J, K, U); + + //##### Change 4: subg row_ptr extraction requires renumbering first ##### + renumber_indices(vertexSubset, hash_t); + + VectorI I_sg(actual_nnz, IndexT(0)); + get_renumbered_indices(I, //in: in=sources; + hash_t, //in: renumbering hash table + I_sg); //out:renumbered: sources[] + +#ifdef DEBUG_NEW + std::cout<<"I_sg: "; + print_v(I_sg, std::cout); + + std::cout<<"nnz="< hash_app(hash); + // thrust::transform(eSub.begin(), eSub.end(), + // sources.begin(), + // hash_app); + // + //replaced by gather... + // + thrust::gather(edgeSubset.begin(), edgeSubset.end(), //range of indexes... + hash.begin(), //...into source + I.begin()); //destination (result) + + assert( sinks0.size() == I.size() ); + +#ifdef DEBUG_EDGES + std::cout<<"sources:"; + print_v(I, std::cout); +#endif + + //now merge sinks with sources + // + VectorI v(nedges<<1);//twice as many edges... 
+ thrust::merge(sinks0.begin(), sinks0.end(), + I.begin(), I.end(), + v.begin()); + + size_t nrows_subg = count_get_distinct(v, vertexSubset); + + //renumber row (vertex) indices: + // + renumber_indices(vertexSubset, hash_t); + + get_renumbered_indices(I, //in: in=sources; + hash_t, //in: renumbering hash table + sinks0); //out:renumbered: sources[] + + //create subgraph row_ptr, + //operating on sources: + // + row_ptr_subg.resize(nrows_subg+1); + make_subg_row_ptr(nedges, //==actual_nnz + nrows_subg, + sinks0, + row_ptr_subg); + + //renumber subg_col_ind: + // + col_ind_subg.resize(nedges); + get_renumbered_indices(J, //in: in=sinks; + hash_t, //in: renumbering hash table + col_ind_subg); //out:renumbered: subg_col_ind[] + + //act (or not) on values: + // + fctrv.update_from(K); + } + + private: + VectorI vertexSubset; //original graph vertex indices used in subgraph + + //#################################### Change 7: + // + VectorI edgeSubset; //original graph edge indices used in subgraph + + + VectorV vals_subg; //not used for non-valued graphs + VectorI row_ptr_subg; + VectorI col_ind_subg; + + //useful for mapping graph <--> subgraph: + // + VectorI I; //subgraph's set of (original graph) row indices + VectorI J; //subgraph's set of (original graph) col indices + //hence, (I[k], J[k]) is an edge in subgraph + + VectorI K; //subgraph's set of (original graph) edge indices + + VectorI hash_t; + + const bool is_vertex_extraction; + }; + + + + + + + //Acyclic Visitor + // (A. Alexandrescu, "Modern C++ Design", Section 10.4), + // where *concrete* Visitors must be parameterized by all + // the possibile template args of the Visited classes (visitees); + // + + //Visitor for SubGraph extraction: + // + template + struct SubGraphExtractorVisitor: + VisitorBase, + Visitor >, + Visitor >, + Visitor >, + Visitor > + { + typedef typename VectorI::value_type IndexType_; + typedef typename VectorV::value_type ValueType_; + typedef typename VectorPtrT::PtrT PtrI; + + //TODO: avoid copy from raw pointer + // + SubGraphExtractorVisitor(CsrGraph& graph, + const VectorI& vSub, + cudaStream_t stream): + row_ptr_(graph.get_raw_row_offsets(), graph.get_raw_row_offsets()+graph.get_num_vertices()+1), + col_ind_(graph.get_raw_column_indices(), graph.get_raw_column_indices()+graph.get_num_edges()), + extractor_(vSub), + stream_(stream) + { + } + + //TODO: avoid copy from raw pointer + // + SubGraphExtractorVisitor(CsrGraph& graph, + const VectorI& eSub, + cudaStream_t stream, + bool use_edges): //just to differentiate vertex vs. edge semantics; value not used + row_ptr_(graph.get_raw_row_offsets(), graph.get_raw_row_offsets()+graph.get_num_vertices()+1), + col_ind_(graph.get_raw_column_indices(), graph.get_raw_column_indices()+graph.get_num_edges()), + extractor_(eSub, false), //different semantics! + stream_(stream) + { + } + + void Visit(Graph& graph) + { + //no-op... 
+ } + + void Visit(CsrGraph& graph) + { + // size_t g_nrows = graph.get_num_vertices(); + // size_t g_nnz = graph.get_num_edges(); + + // VectorI row_ptr(graph.get_raw_row_offsets(), graph.get_raw_row_offsets()+g_nrows+1); + // VectorI col_ind(graph.get_raw_column_indices(), graph.get_raw_column_indices()+g_nnz); + + extractor_(row_ptr_, col_ind_);//TODO: modify operator to work directly with PtrI + + size_t rowptr_sz = extractor_.get_row_ptr().size(); + assert( rowptr_sz >= 1 ); + + size_t subg_nrows = rowptr_sz-1; + size_t subg_nnz = extractor_.get_subg_nnz(); + + subgraph_ = new CsrGraph(subg_nrows, subg_nnz, stream_); + + //TODO: more efficient solution: investigate if/how copy can be avoided + // + thrust::copy(extractor_.get_row_ptr().begin(), extractor_.get_row_ptr().end(), subgraph_->get_raw_row_offsets()); + thrust::copy(extractor_.get_col_ind().begin(), extractor_.get_col_ind().end(), subgraph_->get_raw_column_indices()); + } + + //might not need to implement following Visit methods, + //the one above for CsrGraph might work for derived + //classes... + void Visit(ValuedCsrGraph& graph) + { + size_t g_nrows = graph.get_num_vertices(); + size_t g_nnz = graph.get_num_edges(); + + // VectorI row_ptr(graph.get_raw_row_offsets(), graph.get_raw_row_offsets()+g_nrows+1); + // VectorI col_ind(graph.get_raw_column_indices(), graph.get_raw_column_indices()+g_nnz); + VectorV vals(graph.get_raw_values(), graph.get_raw_values()+g_nnz); + + extractor_(vals, row_ptr_, col_ind_);//TODO: modify operator to work directly with PtrI + + size_t rowptr_sz = extractor_.get_row_ptr().size(); + assert( rowptr_sz >= 1 ); + + size_t subg_nrows = rowptr_sz-1; + size_t subg_nnz = extractor_.get_subg_nnz(); + + ValuedCsrGraph* subg = new ValuedCsrGraph(subg_nrows, subg_nnz, stream_); + + //TODO: more efficient solution: investigate if/how copy can be avoided + // + thrust::copy(extractor_.get_row_ptr().begin(), extractor_.get_row_ptr().end(), subg->get_raw_row_offsets()); + thrust::copy(extractor_.get_col_ind().begin(), extractor_.get_col_ind().end(), subg->get_raw_column_indices()); + thrust::copy(extractor_.get_vals().begin(), extractor_.get_vals().end(), subg->get_raw_values()); + + subgraph_ = subg; + } + + void Visit(MultiValuedCsrGraph& graph) + { + size_t g_nrows = graph.get_num_vertices(); + size_t g_nnz = graph.get_num_edges(); + + // VectorI row_ptr(graph.get_raw_row_offsets(), graph.get_raw_row_offsets()+g_nrows+1); + // VectorI col_ind(graph.get_raw_column_indices(), graph.get_raw_column_indices()+g_nnz); + /// VectorV vals(graph.get_raw_values(), graph.get_raw_values()+g_nnz); + + ///extractor_(vals, row_ptr_, col_ind_); + extractor_(row_ptr_, col_ind_);//TODO: modify operator to work directly with PtrI + + size_t rowptr_sz = extractor_.get_row_ptr().size(); + assert( rowptr_sz >= 1 ); + + size_t subg_nrows = rowptr_sz-1; + size_t subg_nnz = extractor_.get_subg_nnz(); + + MultiValuedCsrGraph* subg = new MultiValuedCsrGraph(subg_nrows, subg_nnz, stream_); + + //TODO: more efficient solution: investigate if/how copy can be avoided + // + thrust::copy(extractor_.get_row_ptr().begin(), extractor_.get_row_ptr().end(), subg->get_raw_row_offsets()); + thrust::copy(extractor_.get_col_ind().begin(), extractor_.get_col_ind().end(), subg->get_raw_column_indices()); + ///thrust::copy(extractor_.get_vals().begin(), extractor_.get_vals().end(), subg->get_raw_values()); + + //additional data extraction: + // + get_vertex_data(graph, extractor_.get_vertex_subset(), *subg); + get_edge_data(graph, 
extractor_.get_K(), *subg); + + subgraph_ = subg; + } + + const SubGraphExtractorFunctor& get_extractor(void) const + { + return extractor_; + } + + CsrGraph* get_subgraph(void) // TODO: change to unique_ptr, when moving to C++1* + { + return subgraph_; + } + protected: + void get_edge_data(MultiValuedCsrGraph& graph_src, + const VectorI& K, //subset of graph edge set + MultiValuedCsrGraph& graph_dest) + { + typedef thrust::device_ptr PtrV; + + size_t ng = graph_src.get_num_edge_dim(); + size_t nedges = K.size(); + + assert( nedges == graph_dest.get_num_edges() ); + + graph_dest.allocateEdgeData(ng, stream_); + + for(unsigned int i=0;i& v_src = graph_src.get_edge_dim(i); + Vector& v_dest = graph_dest.get_edge_dim(i); + + size_t n_src = v_src.get_size(); + PtrV ptr_src(v_src.raw()); + range_view rv_src(ptr_src, ptr_src+n_src); + + size_t n_dest = v_dest.get_size(); + assert( nedges == n_dest ); + + PtrV ptr_dest(v_dest.raw()); + range_view rv_dest(ptr_dest, ptr_dest+n_dest); + + thrust::gather(K.begin(), K.end(), //map of indices + rv_src.begin(), //source + rv_dest.begin()); //source[map] + } + } + + void get_vertex_data(MultiValuedCsrGraph& graph_src, + const VectorI& K,// subset of graph vertex set == vSub + MultiValuedCsrGraph& graph_dest) + { + typedef thrust::device_ptr PtrV; + + size_t ng = graph_src.get_num_vertex_dim(); + size_t nrows = K.size();//remember, K==vSub, here! + + assert( nrows == graph_dest.get_num_vertices() ); + + graph_dest.allocateVertexData(ng, stream_); + + for(unsigned int i=0;i& v_src = graph_src.get_vertex_dim(i); + Vector& v_dest = graph_dest.get_vertex_dim(i); + + size_t n_src = v_src.get_size(); + PtrV ptr_src(v_src.raw()); + range_view rv_src(ptr_src, ptr_src+n_src); + + size_t n_dest = v_dest.get_size(); + assert( nrows == n_dest ); + + PtrV ptr_dest(v_dest.raw()); + range_view rv_dest(ptr_dest, ptr_dest+n_dest); + + thrust::gather(K.begin(), K.end(), //map of indices + rv_src.begin(), //source + rv_dest.begin()); //source[map] + } + } + private: + VectorI row_ptr_; + VectorI col_ind_; + SubGraphExtractorFunctor extractor_; + cudaStream_t stream_; + CsrGraph* subgraph_; // to be constructed + }; + + template + struct BoundValidator + { + BoundValidator(const T& lower_bound, + const T& upper_bound): + lbound_(lower_bound), + ubound_(upper_bound) + { + } + + __host__ __device__ + bool operator() (T k) + { + return ( k < lbound_ || k > ubound_ ); + } + + private: + T lbound_; + T ubound_; + }; + + template + struct NotSortedAscendingly + { + typedef typename Container::value_type VType; + typedef typename VectorPtrT::PtrT PtrT; + + NotSortedAscendingly(Container& rv, const size_t& sz): + ptr_(&rv[0]), + sz_(sz) + { + + } + + __host__ __device__ + bool operator() (VType k) + { + if( k+1 < sz_ ) + return ptr_[k+1] < ptr_[k]; + else + return false; + } + private: + PtrT ptr_;//no reference! must be copy constructed + size_t sz_; + }; + + template + void validate_input(VectorI& v, typename VectorI::value_type sz) + { + typedef typename VectorI::value_type IndexT; + + size_t n = v.size(); + + if( n == 0 ) + FatalError("0-sized array input in subgraph extraction.",NVGRAPH_ERR_BAD_PARAMETERS); + + IndexT lb = 0; + IndexT ub = sz-1; + BoundValidator bvld(lb, ub);//closed interval! 
+ typename VectorI::iterator pos = thrust::find_if(v.begin(), v.end(), bvld); + if( pos != v.end() ) + FatalError("Input is not a valid subset of the graph's corresponding set.",NVGRAPH_ERR_BAD_PARAMETERS); + + VectorI seq(n,0); + thrust::sequence(seq.begin(), seq.end()); + NotSortedAscendingly nsa_f(v, n); + pos = thrust::find_if(seq.begin(), seq.end(), nsa_f); + if( pos != seq.end() ) + FatalError("Input array not sorted in ascending order.",NVGRAPH_ERR_BAD_PARAMETERS); + + pos = thrust::unique(v.begin(), v.end()); + if( pos != v.end() ) + FatalError("Input array has duplicates.",NVGRAPH_ERR_BAD_PARAMETERS); + + } + + template + CsrGraph* extract_from_vertex_subset(CsrGraph& graph, + IndexT* pV, size_t n, cudaStream_t stream) + { + typedef thrust::device_vector VectorI; + typedef thrust::device_vector VectorV; + VectorI vSub(pV, pV+n); + + validate_input(vSub, graph.get_num_vertices()); + + SubGraphExtractorVisitor visitor(graph, vSub, stream); + graph.Accept(visitor); + return visitor.get_subgraph(); + } + + template + CsrGraph* extract_from_edge_subset(CsrGraph& graph, + IndexT* pV, size_t n, cudaStream_t stream) + { + typedef thrust::device_vector VectorI; + typedef thrust::device_vector VectorV; + VectorI vSub(pV, pV+n); + + validate_input(vSub, graph.get_num_edges()); + + SubGraphExtractorVisitor visitor(graph, vSub, stream, true); + graph.Accept(visitor); + return visitor.get_subgraph(); + } + +}//end namespace + +#endif diff --git a/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx b/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx new file mode 100644 index 00000000000..36d3fced642 --- /dev/null +++ b/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx @@ -0,0 +1,2245 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPH_CONTRACTING_STRUCTS_HXX +#define GRAPH_CONTRACTING_STRUCTS_HXX + +#include +#include //which includes all other headers... +#include // TODO: to be changed to thrust/range_view.h, when toolkit gets in sync with Thrust + +#include + +//from amgx/amg/base/include/sm_utils.inl +//{ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __PTR "l" +#else +#define __PTR "r" +#endif +//} + +namespace nvgraph +{ + //from amgx/amg/base/include/sm_utils.inl + //{ + namespace utils + { + + + // ==================================================================================================================== + // Warp tools. 
+ // ==================================================================================================================== + + static __device__ __forceinline__ int lane_id() + { + int id; + asm( "mov.u32 %0, %%laneid;" : "=r"(id) ); + return id; + } + + static __device__ __forceinline__ int lane_mask_lt() + { + int mask; + asm( "mov.u32 %0, %%lanemask_lt;" : "=r"(mask) ); + return mask; + } + + static __device__ __forceinline__ int warp_id() + { + return threadIdx.x >> 5; + } + + + // ==================================================================================================================== + // Atomics. + // ==================================================================================================================== + static __device__ __forceinline__ void atomic_add( float *address, float value ) + { + atomicAdd( address, value ); + } + + static __device__ __forceinline__ void atomic_add( double *address, double value ) + { + unsigned long long *address_as_ull = (unsigned long long *) address; + unsigned long long old = __double_as_longlong( address[0] ), assumed; + do { + assumed = old; + old = atomicCAS( address_as_ull, assumed, __double_as_longlong( value + __longlong_as_double( assumed ) ) ); + } + while( assumed != old ); + } + + + // ==================================================================================================================== + // Bit tools. + // ==================================================================================================================== + + static __device__ __forceinline__ int bfe( int src, int num_bits ) + { + unsigned mask; + asm( "bfe.u32 %0, %1, 0, %2;" : "=r"(mask) : "r"(src), "r"(num_bits) ); + return mask; + } + + static __device__ __forceinline__ int bfind( int src ) + { + int msb; + asm( "bfind.u32 %0, %1;" : "=r"(msb) : "r"(src) ); + return msb; + } + + static __device__ __forceinline__ int bfind( unsigned long long src ) + { + int msb; + asm( "bfind.u64 %0, %1;" : "=r"(msb) : "l"(src) ); + return msb; + } + + + + // ==================================================================================================================== + // Shuffle. + // ==================================================================================================================== + static __device__ __forceinline__ float shfl( float r, int lane, int bound = 32) + { +#if __CUDA_ARCH__ >= 300 + return __shfl( r, lane, bound ); +#else + return 0.0f; +#endif + } + + static __device__ __forceinline__ double shfl( double r, int lane, int bound=32 ) + { +#if __CUDA_ARCH__ >= 300 + int hi = __shfl( __double2hiint(r), lane, bound ); + int lo = __shfl( __double2loint(r), lane, bound ); + return __hiloint2double( hi, lo ); +#else + return 0.0; +#endif + } + + static __device__ __forceinline__ float shfl_xor( float r, int mask, int bound=32 ) + { +#if __CUDA_ARCH__ >= 300 + return __shfl_xor( r, mask, bound ); +#else + return 0.0f; +#endif + } + + static __device__ __forceinline__ double shfl_xor( double r, int mask, int bound=32 ) + { +#if __CUDA_ARCH__ >= 300 + int hi = __shfl_xor( __double2hiint(r), mask, bound ); + int lo = __shfl_xor( __double2loint(r), mask, bound ); + return __hiloint2double( hi, lo ); +#else + return 0.0; +#endif + } + + + + // ==================================================================================================================== + // Loads. 
+ // ==================================================================================================================== + + enum Ld_mode { LD_AUTO = 0, LD_CA, LD_CG, LD_TEX, LD_NC }; + + template< Ld_mode Mode > + struct Ld {}; + + template<> + struct Ld + { + template< typename T > + static __device__ __forceinline__ T load( const T *ptr ) { return *ptr; } + }; + + template<> + struct Ld + { + static __device__ __forceinline__ int load( const int *ptr ) + { + int ret; + asm volatile ( "ld.global.cg.s32 %0, [%1];" : "=r"(ret) : __PTR(ptr) ); + return ret; + } + + static __device__ __forceinline__ float load( const float *ptr ) + { + float ret; + asm volatile ( "ld.global.cg.f32 %0, [%1];" : "=f"(ret) : __PTR(ptr) ); + return ret; + } + + static __device__ __forceinline__ double load( const double *ptr ) + { + double ret; + asm volatile ( "ld.global.cg.f64 %0, [%1];" : "=d"(ret) : __PTR(ptr) ); + return ret; + } + + }; + + template<> + struct Ld + { + static __device__ __forceinline__ int load( const int *ptr ) + { + int ret; + asm volatile ( "ld.global.ca.s32 %0, [%1];" : "=r"(ret) : __PTR(ptr) ); + return ret; + } + + static __device__ __forceinline__ float load( const float *ptr ) + { + float ret; + asm volatile ( "ld.global.ca.f32 %0, [%1];" : "=f"(ret) : __PTR(ptr) ); + return ret; + } + + static __device__ __forceinline__ double load( const double *ptr ) + { + double ret; + asm volatile ( "ld.global.ca.f64 %0, [%1];" : "=d"(ret) : __PTR(ptr) ); + return ret; + } + }; + + template<> + struct Ld + { + template< typename T > + static __device__ __forceinline__ T load( const T *ptr ) { return __ldg( ptr ); } + }; + + + template < typename T, typename POD_TYPE = T > + struct util; + + template <> + struct util + { + typedef double uptype; + typedef float downtype; + + static const bool is_real = true; + static const bool is_complex = false; + + static __host__ __device__ __inline__ float get_zero(){ return 0.f; } + static __host__ __device__ __inline__ float get_one(){ return 1.f; } + static __host__ __device__ __inline__ float get_minus_one(){ return -1.f; } + // exact comaprison, which might result wrong answer in a lot of cases + static __host__ __device__ __inline__ bool is_zero(const float& val){ return val == get_zero(); } + static __host__ __device__ __inline__ bool is_equal(const float& val1, const float& val2) { return val1 == val2;} ; + + static __host__ __device__ __inline__ float invert(const float& val) {return -val;} + static __host__ __device__ __inline__ float conjugate(const float& val) {return val;} + static __host__ __device__ __inline__ void invert_inplace(float& val) {val = -val;} + static __host__ __device__ __inline__ void conjugate_inplace(float& val) {} + + static __host__ __device__ __inline__ float abs (const float& val) + { + return fabs(val); + } + + template + static __host__ __device__ __inline__ void to_uptype (const float& src, V& dst) + { + dst = (V)(src); + } + + static __host__ __device__ __inline__ float to_downtype (const float& src) + { + return src; + } + + static __host__ __device__ __inline__ float volcast (const volatile float& val) {return val;} + static __host__ __device__ __inline__ void volcast (const float& val, volatile float* ret) {*ret = val;} + + /*template + static __host__ __device__ __inline__ float mul(const float& val, const M& mult) + { + static_assert(util::is_real(), "Multiply is supported for real constant only"); + return val*mult; + }*/ + + static void printf(const char* fmt, const float& val) { ::printf(fmt, val); } + static 
void fprintf(FILE* f, const char* fmt, const float& val) { ::fprintf(f, fmt, val); } + }; + + template <> + struct util + { + typedef double uptype; + typedef float downtype; + + static const bool is_real = true; + static const bool is_complex = false; + + static __host__ __device__ __inline__ double get_zero(){ return 0.; } + static __host__ __device__ __inline__ double get_one(){ return 1.; } + static __host__ __device__ __inline__ double get_minus_one(){ return -1.; } + + static __host__ __device__ __inline__ bool is_zero(const double& val){ return val == get_zero(); } + static __host__ __device__ __inline__ bool is_equal(const double& val1, double& val2) { return val1 == val2;} ; + + static __host__ __device__ __inline__ double invert(const double& val) {return -val;} + static __host__ __device__ __inline__ double conjugate(const double& val) {return val;} + static __host__ __device__ __inline__ void invert_inplace(double& val) {val = -val;} + static __host__ __device__ __inline__ void conjugate_inplace(double& val) {} + + static __host__ __device__ __inline__ double abs (const double& val) + { + return fabs(val); + } + + template + static __host__ __device__ __inline__ void to_uptype (const float& src, V& dst) + { + dst = (V)(src); + } + + static __host__ __device__ __inline__ float to_downtype (const float& src) + { + return (float)src; + } + + static __host__ __device__ __inline__ double volcast (const volatile double& val) {return val;} + static __host__ __device__ __inline__ void volcast (const double& val, volatile double* ret) {*ret = val;} + + /* + template + static __host__ __device__ __inline__ double mulf(const double& val, const M& mult) + { + static_assert(util::is_real(), "Multiply is supported for real constant only"); + return val*mult; + }*/ + + static void printf(const char* fmt, const double& val) { ::printf(fmt, val); } + static void fprintf(FILE* f, const char* fmt,const double& val) { ::fprintf(f, fmt, val); } + }; + + + // ==================================================================================================================== + // Warp-level reductions. 
+  // ====================================================================================================================
+
+  struct Add
+  {
+    template< typename Value_type >
+    static __device__ __forceinline__ Value_type eval( Value_type x, Value_type y ) { return x+y; }
+  };
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+
+  template< int NUM_THREADS_PER_ITEM, int WARP_SIZE >
+  struct Warp_reduce_pow2
+  {
+    template< typename Operator, typename Value_type >
+    static __device__ __inline__ Value_type execute( Value_type x )
+    {
+#pragma unroll
+      for( int mask = WARP_SIZE / 2 ; mask >= NUM_THREADS_PER_ITEM ; mask >>= 1 )
+        x = Operator::eval( x, shfl_xor(x, mask) );
+      return x;
+    }
+  };
+
+  template< int NUM_THREADS_PER_ITEM, int WARP_SIZE >
+  struct Warp_reduce_linear
+  {
+    template< typename Operator, typename Value_type >
+    static __device__ __inline__ Value_type execute( Value_type x )
+    {
+      const int NUM_STEPS = WARP_SIZE / NUM_THREADS_PER_ITEM;
+      int my_lane_id = utils::lane_id();
+#pragma unroll
+      for( int i = 1 ; i < NUM_STEPS ; ++i )
+      {
+        Value_type y = shfl_down( x, i*NUM_THREADS_PER_ITEM );
+        if( my_lane_id < NUM_THREADS_PER_ITEM )
+          x = Operator::eval( x, y );
+      }
+      return x;
+    }
+  };
+
+#else
+
+  template< int NUM_THREADS_PER_ITEM, int WARP_SIZE >
+  struct Warp_reduce_pow2
+  {
+    template< typename Operator, typename Value_type >
+    static __device__ __inline__ Value_type execute( volatile Value_type *smem, Value_type x )
+    {
+      int my_lane_id = utils::lane_id();
+#pragma unroll
+      for( int offset = WARP_SIZE / 2 ; offset >= NUM_THREADS_PER_ITEM ; offset >>= 1 )
+        if( my_lane_id < offset )
+        {
+          x = Operator::eval( x, smem[threadIdx.x+offset] );
+          util<Value_type>::volcast(x, smem + threadIdx.x);
+        }
+      return x;
+    }
+  };
+
+  template< int NUM_THREADS_PER_ITEM, int WARP_SIZE >
+  struct Warp_reduce_linear
+  {
+    template< typename Operator, typename Value_type >
+    static __device__ __inline__ Value_type execute( volatile Value_type *smem, Value_type x )
+    {
+      const int NUM_STEPS = WARP_SIZE / NUM_THREADS_PER_ITEM;
+      int my_lane_id = utils::lane_id();
+#pragma unroll
+      for( int i = 1 ; i < NUM_STEPS ; ++i )
+        if( my_lane_id < NUM_THREADS_PER_ITEM )
+        {
+          x = Operator::eval( x, smem[threadIdx.x+i*NUM_THREADS_PER_ITEM] );
+          util<Value_type>::volcast(x, smem + threadIdx.x);
+        }
+      return x;
+    }
+  };
+
+#endif
+
+  // ====================================================================================================================
+
+  template< int NUM_THREADS_PER_ITEM, int WARP_SIZE = 32 >
+  struct Warp_reduce : public Warp_reduce_pow2<NUM_THREADS_PER_ITEM, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce< 3, WARP_SIZE> : public Warp_reduce_linear< 3, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce< 4, WARP_SIZE> : public Warp_reduce_linear< 4, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce< 5, WARP_SIZE> : public Warp_reduce_linear< 5, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce< 6, WARP_SIZE> : public Warp_reduce_linear< 6, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce< 7, WARP_SIZE> : public Warp_reduce_linear< 7, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce< 9, WARP_SIZE> : public Warp_reduce_linear< 9, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce<10, WARP_SIZE> : public Warp_reduce_linear<10, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce<11, WARP_SIZE> : public Warp_reduce_linear<11, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce<12, WARP_SIZE> : public Warp_reduce_linear<12, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce<13, WARP_SIZE> : public Warp_reduce_linear<13, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce<14, WARP_SIZE> : public Warp_reduce_linear<14, WARP_SIZE> {};
+
+  template< int WARP_SIZE >
+  struct Warp_reduce<15, WARP_SIZE> : public Warp_reduce_linear<15, WARP_SIZE> {};
+
+  // ====================================================================================================================
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+
+  template< int NUM_THREADS_PER_ITEM, typename Operator, typename Value_type >
+  static __device__ __forceinline__ Value_type warp_reduce( Value_type x )
+  {
+    return Warp_reduce<NUM_THREADS_PER_ITEM>::template execute<Operator>( x );
+  }
+
+#else
+
+  template< int NUM_THREADS_PER_ITEM, typename Operator, typename Value_type >
+  static __device__ __forceinline__ Value_type warp_reduce( volatile Value_type *smem, Value_type x )
+  {
+    return Warp_reduce<NUM_THREADS_PER_ITEM>::template execute<Operator>( smem, x );
+  }
+
+  template< int NUM_THREADS_PER_ITEM, typename Value_type, int WARP_SIZE >
+  static __device__ __forceinline__ Value_type warp_reduce_sum(volatile Value_type *smem, Value_type x)
+  {
+    const int NUM_STEPS = WARP_SIZE / NUM_THREADS_PER_ITEM;
+    int my_lane_id = utils::lane_id();
+#pragma unroll
+    for (int i = 1; i < NUM_STEPS; ++i)
+      if (my_lane_id < NUM_THREADS_PER_ITEM)
+      {
+        x = x + util<Value_type>::volcast(smem[threadIdx.x + i*NUM_THREADS_PER_ITEM]);
+        util<Value_type>::volcast(x, smem + threadIdx.x);
+      }
+    return x;
+  }
+
+#endif
+
+
+
+  }//namespace utils
+  //}
+
+
+  template< typename Key_type, int SMEM_SIZE=128, int WARP_SIZE=32 >
+  class Hash_index
+  {
+  public:
+    // The number of registers needed to store the index.
+    enum { REGS_SIZE = SMEM_SIZE / WARP_SIZE };
+
+    //private:
+    // The partial sums of the index (stored in registers).
+    int m_partial[REGS_SIZE];
+    // The index in GMEM.
+    int *m_gmem;
+
+  public:
+    // Create an index (to be associated with a hash set).
+    __device__ __forceinline__ Hash_index( int *gmem ) : m_gmem(gmem) {}
+
+    // Build the index from a SMEM buffer of size SMEM_SIZE.
+    __device__ __forceinline__ void build_smem_index( const volatile Key_type *s_buffer );
+    // Given an offset in SMEM, it finds the index.
+    __device__ __forceinline__ int find_smem( int offset ) const;
+    // Given an offset in GMEM, it finds the index.
+    __device__ __forceinline__ int find_gmem( int offset ) const;
+    // Set an indexed item in GMEM.
+ __device__ __forceinline__ void set_gmem_index( int offset, int val ) { m_gmem[offset] = val; } + }; + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int WARP_SIZE > + __device__ __forceinline__ + void + Hash_index::build_smem_index( const volatile Key_type *s_buffer ) + { + const int lane_id = utils::lane_id(); +#pragma unroll + for( int i = 0, offset = lane_id ; i < REGS_SIZE ; ++i, offset += WARP_SIZE ) + m_partial[i] = __ballot( s_buffer[offset] != -1 ); + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int WARP_SIZE > + __device__ __forceinline__ + int + Hash_index::find_smem( int offset ) const + { + const int offset_div_warp_size = offset / WARP_SIZE; + const int offset_mod_warp_size = offset % WARP_SIZE; + + int result = 0; +#pragma unroll + for( int i = 0 ; i < REGS_SIZE ; ++i ) + { + int mask = 0xffffffff; + if( i == offset_div_warp_size ) + mask = (1 << offset_mod_warp_size) - 1; + if( i <= offset_div_warp_size ) + result += __popc( m_partial[i] & mask ); + } + return result; + } + + template< typename Key_type, int SMEM_SIZE, int WARP_SIZE > + __device__ __forceinline__ + int + Hash_index::find_gmem( int offset ) const + { + return m_gmem[offset]; + } + + + + static __constant__ unsigned c_hash_keys[] = + { + 3499211612, 581869302, 3890346734, 3586334585, + 545404204, 4161255391, 3922919429, 949333985, + 2715962298, 1323567403, 418932835, 2350294565, + 1196140740, 809094426, 2348838239, 4264392720 + }; + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + template< typename Key_type, int SMEM_SIZE=128, int NUM_HASH_FCTS=4, int WARP_SIZE=32 > + class Hash_set + { + // Associated index. + typedef Hash_index Index; + + protected: + // The size of the table (occupancy). + int m_smem_count, m_gmem_count; + // The keys stored in the hash table. + volatile Key_type *m_smem_keys, *m_gmem_keys; + // The size of the global memory buffer. + const int m_gmem_size; + // Is it ok? + bool m_fail; + + // DEBUG + // bool m_print; + // END OF DEBUG. + + public: + // Constructor. + __device__ __forceinline__ Hash_set( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, int gmem_size ) : + m_smem_count(0), + m_gmem_count(1), + m_smem_keys (smem_keys), + m_gmem_keys (gmem_keys), + m_gmem_size (gmem_size), + m_fail (false) + + // DEBUG + // , m_print(true) + // END OF DEBUG + {} + + // Clear the table. + __device__ __forceinline__ void clear( bool skip_gmem = false ); + // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). + __device__ __forceinline__ int compute_size(); + // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). + __device__ __forceinline__ int compute_size_with_duplicates(); + // Does the set contain those values? + __device__ __forceinline__ bool contains( Key_type key ) const; + // Find an index. + __device__ __forceinline__ int find_index( Key_type key, const Index &index, bool print_debug ) const; + // Has the process failed. + __device__ __forceinline__ bool has_failed() const { return m_fail; } + // Insert a key inside the set. If status is NULL, ignore failure. 
+ __device__ __forceinline__ void insert( Key_type key, int *status ); + // Load a set. + __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); + // Load a set and use it as an index. + __device__ __forceinline__ void load_index( int count, const Key_type *keys, const int *pos, Index &index, bool print_debug ); + // Store a set. + __device__ __forceinline__ void store( int count, Key_type *keys ); + // Store a set. + __device__ __forceinline__ int store_with_positions( Key_type *keys, int *pos ); + // Store a set. + __device__ __forceinline__ int store( Key_type *keys ); + }; + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> + __device__ __forceinline__ + void Hash_set::clear( bool skip_gmem ) + { + int lane_id = utils::lane_id(); + + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + m_smem_keys[i_step*WARP_SIZE + lane_id] = -1; + m_smem_count = 0; + + if( skip_gmem || m_gmem_count == 0 ) + { + m_gmem_count = 0; + return; + } + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + m_gmem_keys[offset] = -1; + m_gmem_count = 0; + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> + __device__ __forceinline__ + int Hash_set::compute_size() + { + m_smem_count += m_gmem_count; +#pragma unroll + for( int offset = WARP_SIZE/2 ; offset > 0 ; offset >>= 1 ) + m_smem_count += __shfl_xor( m_smem_count, offset ); + m_gmem_count = __any( m_gmem_count > 0 ); + return m_smem_count; + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> + __device__ __forceinline__ + int Hash_set::compute_size_with_duplicates() + { + int lane_id = utils::lane_id(); + + // Count the number of keys in SMEM. + int sum = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + sum += __popc( __ballot( key != -1 ) ); + } + + // Is there any key in GMEM. If not, just quit. + m_gmem_count = __any(m_gmem_count > 0); + if( !m_gmem_count ) + return sum; + + // Count the number of keys in GMEM. 
+#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + sum += __popc( __ballot( key != -1 ) ); + } + return sum; + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> + __device__ __forceinline__ + bool Hash_set::contains( Key_type key ) const + { + bool done = key == -1, found = false; +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return found; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE-1); + if( !done ) + { + Key_type stored_key = m_smem_keys[hash]; + if( stored_key == key ) + found = true; + if( found || stored_key == -1 ) + done = true; + } + } + + const int num_bits = utils::bfind( m_gmem_size ); // TODO: move it outside ::insert. +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return found; + unsigned ukey = reinterpret_cast( key ); + int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + if( !done ) + { + Key_type stored_key = m_gmem_keys[hash]; + if( stored_key == key ) + found = true; + if( found || stored_key == -1 ) + done = true; + } + } + return found; + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + int Hash_set::find_index( Key_type key, const Index &index, bool print_debug ) const + { + int idx = -1; + bool done = key == -1; +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return idx; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE-1); + int result = index.find_smem(hash); + if( !done ) + { + Key_type stored_key = m_smem_keys[hash]; + if( stored_key == key ) + { + idx = result; + done = true; + } + } + } + + const int num_bits = utils::bfind( m_gmem_size ); // TODO: move it outside ::insert. 
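+      // Keys that were not resolved in the SMEM table above are probed in the per-warp
+      // overflow table in GMEM: m_gmem_size is a power of two, so bfind() gives
+      // log2(m_gmem_size) and bfe() keeps that many low bits of the hashed key as the slot.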
+#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return idx; + unsigned ukey = reinterpret_cast( key ); + int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + if( !done ) + { + Key_type stored_key = m_gmem_keys[hash]; + if( stored_key == key ) + { + idx = index.find_gmem(hash); + done = true; + } + } + } + + // if( key != -1 && idx == -1 ) + // printf( "ERROR: Couldn't find the index!!!!\n"); + return idx; + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_set::insert( Key_type key, int *status ) + { + bool done = key == -1; +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return; + bool candidate = false; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE-1); + if( !done ) + { + Key_type stored_key = m_smem_keys[hash]; + if( stored_key == key ) + done = true; + candidate = stored_key == -1; + if( candidate ) + m_smem_keys[hash] = key; + if( candidate && key == m_smem_keys[hash] ) // More than one candidate may have written to that slot. + { + m_smem_count++; + done = true; + } + } + } + + const int num_bits = utils::bfind( m_gmem_size ); // TODO: move it outside ::insert. +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return; + bool candidate = false; + unsigned ukey = reinterpret_cast( key ); + int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + if( !done ) + { + Key_type stored_key = m_gmem_keys[hash]; + if( stored_key == key ) + done = true; + candidate = stored_key == -1; + if( candidate ) + m_gmem_keys[hash] = key; + if( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + { + m_gmem_count++; + done = true; + } + } + } + + if( __all(done) ) + return; + assert( status != NULL ); + if( utils::lane_id() == 0 ) + *status = 1; + m_fail = true; + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_set::load( int count, const Key_type *keys, const int *pos ) + { + int lane_id = utils::lane_id(); + +#pragma unroll 4 + for( int offset = lane_id ; offset < count ; offset += WARP_SIZE ) + { + Key_type key = keys[offset]; + int idx = pos [offset]; + + // Where to store the item. + volatile Key_type *ptr = m_smem_keys; + if( idx >= SMEM_SIZE ) + { + ptr = m_gmem_keys; + m_gmem_count = 1; + idx -= SMEM_SIZE; + } + + // Store the item. 
+ ptr[idx] = key; + } + m_gmem_count = __any( m_gmem_count ); + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_set::load_index( int count, const Key_type *keys, const int *pos, Index &index, bool print_debug ) + { +#pragma unroll 4 + for( int offset = utils::lane_id() ; offset < count ; offset += WARP_SIZE ) + { + Key_type key = keys[offset]; + int idx = pos [offset]; + + // Store the item. + volatile Key_type *ptr = m_smem_keys; + if( idx >= SMEM_SIZE ) + { + ptr = m_gmem_keys; + m_gmem_count = 1; + idx -= SMEM_SIZE; + index.set_gmem_index( idx, offset ); + } + + // Store the item. + ptr[idx] = key; + } + + // Build the local index. + index.build_smem_index( m_smem_keys ); + m_gmem_count = __any( m_gmem_count ); + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_set::store( int count, Key_type *keys ) + { + int lane_id = utils::lane_id(); + int lane_mask_lt = utils::lane_mask_lt(); + + int warp_offset = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + keys[dst_offset] = key; + warp_offset += __popc( poll ); + } + + m_gmem_count = __any( m_gmem_count > 0 ); + if( !m_gmem_count ) + return; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + keys[dst_offset] = key; + warp_offset += __popc( poll ); + } + } + + // ==================================================================================================================== + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + int Hash_set::store_with_positions( Key_type *keys, int *pos ) + { + int lane_id = utils::lane_id(); + int lane_mask_lt = utils::lane_mask_lt(); + + int warp_offset = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + pos [dst_offset] = offset; + } + warp_offset += __popc( poll ); + } + + m_gmem_count = __any( m_gmem_count > 0 ); + if( !m_gmem_count ) + return warp_offset; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + pos [dst_offset] = SMEM_SIZE + offset; + } + warp_offset += __popc( poll ); + } + return warp_offset; + } 
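+
+  // Illustrative usage sketch; the buffer and variable names below (s_keys, g_keys,
+  // out_cols, out_pos) are hypothetical. A single warp owns one Hash_set and drives it
+  // roughly as follows:
+  //
+  //   Hash_set<int, 128, 4, 32> set( s_keys, g_keys, gmem_size );
+  //   set.clear();                                   // reset SMEM (and GMEM) slots to -1
+  //   set.insert( col_id, status );                  // per lane; key == -1 is ignored
+  //   int nnz = set.compute_size_with_duplicates();  // meaningful in lane 0 only
+  //   set.store_with_positions( out_cols, out_pos ); // compacted keys + slot positions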
+ + + template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + int Hash_set::store( Key_type *keys ) + { + int lane_id = utils::lane_id(); + int lane_mask_lt = utils::lane_mask_lt(); + + int warp_offset = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + } + warp_offset += __popc( poll ); + } + + m_gmem_count = __any( m_gmem_count > 0 ); + if( !m_gmem_count ) + return warp_offset; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + } + warp_offset += __popc( poll ); + } + return warp_offset; + } + + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + union Word { char b8[4]; int b32; }; + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE=128, int NUM_HASH_FCTS=4, int WARP_SIZE=32 > + class Hash_map + { + protected: + // The keys stored in the map. + volatile Key_type *m_smem_keys, *m_gmem_keys; + // Vote buffer for values. + volatile Word *m_smem_vote; + // Registers to store values. + T m_regs_vals[4]; + // The values stored in the map. + T *m_gmem_vals; + // The size of the global memory buffer. + const int m_gmem_size; + // Is there any value in GMEM. + bool m_any_gmem; + + public: + // Constructor. + __device__ __forceinline__ + Hash_map( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, volatile Word *smem_vote, T *gmem_vals, int gmem_size ) : + m_smem_keys(smem_keys), + m_gmem_keys(gmem_keys), + m_smem_vote(smem_vote), + m_gmem_vals(gmem_vals), + m_gmem_size(gmem_size), + m_any_gmem (true) + {} + + // Clear the table. It doesn't clear GMEM values. + __device__ __forceinline__ void clear(); + // Clear the table. It also clears GMEM values (set them to 0). + __device__ __forceinline__ void clear_all(); + // Insert a key/value inside the hash table. + __device__ __forceinline__ void insert( Key_type key, T a_value, T b_value, int *status ); + // Insert a key/value inside the hash table. + __device__ __forceinline__ void insert_with_duplicates( Key_type key, T val, int *status ); + // Load a set. + __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); + // Store the map. + __device__ __forceinline__ void store( int count, T *vals ); + // Store the map. + __device__ __forceinline__ void store( int count, Key_type *keys, T *vals ); + // Store the map. + __device__ __forceinline__ void store_map_keys_scale_values( int count, const int *map, Key_type *keys, T alpha, T *vals ); + // Store the map. + __device__ __forceinline__ void store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ); + // Update a value in the table but do not insert if it doesn't exist. 
+ __device__ __forceinline__ bool update( Key_type key, T value ); + + protected: + // Get the selected item in the register buffer. + __device__ __forceinline__ int get_selected( int hash ) const + { + return static_cast(m_smem_vote[hash%WARP_SIZE].b8[hash/WARP_SIZE]); + } + + // Is it the selected item in the register buffer. + __device__ __forceinline__ bool is_selected( int hash, int lane_id ) const + { + return m_smem_vote[hash%WARP_SIZE].b8[hash/WARP_SIZE] == reinterpret_cast(lane_id); + } + + // Push my ID in the register buffer. + __device__ __forceinline__ void try_selection( int hash, int lane_id ) + { + m_smem_vote[hash%WARP_SIZE].b8[hash/WARP_SIZE] = reinterpret_cast(lane_id); + } + }; + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::clear() + { + int lane_id = utils::lane_id(); + + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + m_smem_keys[i_step*WARP_SIZE + lane_id] = -1; + +#pragma unroll + for( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) + m_regs_vals[i_regs] = T(0); + + if( !m_any_gmem ) + return; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + m_gmem_keys[offset] = -1; + m_any_gmem = false; + } + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::clear_all() + { + int lane_id = utils::lane_id(); + + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + m_smem_keys[i_step*WARP_SIZE + lane_id] = -1; + +#pragma unroll + for( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) + m_regs_vals[i_regs] = T(0); + + if( !m_any_gmem ) + return; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + m_gmem_keys[offset] = -1; + m_gmem_vals[offset] = T(0); + } + m_any_gmem = false; + } + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::insert( Key_type key, T a_value, T b_value, int *status ) + { + const int lane_id = utils::lane_id(); + bool done = key == -1; + + m_smem_vote[lane_id].b32 = 0x20202020; +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( i_hash > 0 && __all(done) ) + break; + bool candidate = false; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE-1); + if( !done ) + { + Key_type stored_key = m_smem_keys[hash]; + if( stored_key == key ) + { + this->try_selection( hash, lane_id ); + done = true; + } + candidate = stored_key == -1; + if( candidate ) + m_smem_keys[hash] = key; + if( candidate && key == m_smem_keys[hash] ) + { + this->try_selection( hash, lane_id ); + done = true; + } + } + } + + Word my_vote; + my_vote.b32 = m_smem_vote[lane_id].b32; +#pragma unroll + for( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) + { + int my_src = my_vote.b8[i_regs]; + T other_val = utils::shfl( b_value, my_src ); + if( 
my_src != WARP_SIZE ) + m_regs_vals[i_regs] = m_regs_vals[i_regs] + a_value * other_val; + } + + const int num_bits = utils::bfind( m_gmem_size ); // TODO: move it outside ::insert. +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return; + m_any_gmem = true; + bool candidate = false; + unsigned ukey = reinterpret_cast( key ); + int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + if( !done ) + { + Key_type stored_key = m_gmem_keys[hash]; + if( stored_key == key ) + { + m_gmem_vals[hash] = m_gmem_vals[hash] + a_value * b_value; + done = true; + } + candidate = stored_key == -1; + if( candidate ) + m_gmem_keys[hash] = key; + if( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + { + m_gmem_vals[hash] = a_value * b_value; + done = true; + } + } + } + if( status == NULL || __all(done) ) + return; + if( lane_id == 0 ) + status[0] = 1; + } + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::insert_with_duplicates( Key_type key, T val, int *status ) + { + const int lane_id = utils::lane_id(); + bool done = key == -1; + + m_smem_vote[lane_id].b32 = 0x20202020; +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + break; + bool candidate = false; + bool maybe_in_conflict = false; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE-1); + if( !done ) + { + Key_type stored_key = m_smem_keys[hash]; + if( stored_key == key ) + { + this->try_selection( hash, lane_id ); + maybe_in_conflict = true; + done = true; // Is it really done??? + } + candidate = stored_key == -1; + if( candidate ) + m_smem_keys[hash] = key; + if( candidate && key == m_smem_keys[hash] ) + { + this->try_selection( hash, lane_id ); + maybe_in_conflict = true; + done = true; + } + } + + // Fix conflicts. + bool in_conflict = maybe_in_conflict && !this->is_selected(hash, lane_id); + while( __any( in_conflict ) ) + { + int winner = in_conflict ? this->get_selected(hash) : WARP_SIZE; + T other_val = utils::shfl( val, winner ); + if( in_conflict ) + this->try_selection(hash, lane_id); + if( in_conflict && this->is_selected(hash, lane_id) ) + { + val = val + other_val; + in_conflict = false; + } + } + } + + Word my_vote; + my_vote.b32 = m_smem_vote[lane_id].b32; +#pragma unroll + for( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) + { + int my_src = my_vote.b8[i_regs]; + T other_val = utils::shfl( val, my_src ); + if( my_src != WARP_SIZE ) + m_regs_vals[i_regs] = m_regs_vals[i_regs] + other_val; + } + + const int num_bits = utils::bfind( m_gmem_size ); // TODO: move it outside ::insert. 
+#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return; + m_any_gmem = true; + bool candidate = false; + unsigned ukey = reinterpret_cast( key ); + int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + if( !done ) + { + Key_type stored_key = m_gmem_keys[hash]; + if( stored_key == key ) + { + utils::atomic_add( &m_gmem_vals[hash], val ); + done = true; + } + candidate = stored_key == -1; + if( candidate ) + m_gmem_keys[hash] = key; + if( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + { + utils::atomic_add( &m_gmem_vals[hash], val ); + done = true; + } + } + } + if( status == NULL || __all(done) ) + return; + if( lane_id == 0 ) + status[0] = 1; + } + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::load( int count, const Key_type *keys, const int *pos ) + { + int lane_id = utils::lane_id(); + +#pragma unroll 4 + for( int offset = lane_id ; offset < count ; offset += WARP_SIZE ) + { + Key_type key = keys[offset]; + int idx = pos [offset]; + + // Where to store the item. + volatile Key_type *ptr = m_smem_keys; + if( idx >= SMEM_SIZE ) + { + ptr = m_gmem_keys; + m_any_gmem = 1; + idx -= SMEM_SIZE; + m_gmem_vals[idx] = T(0); + } + + // Store the item. + ptr[idx] = key; + } + m_any_gmem = __any( m_any_gmem ); + } + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::store( int count, T *vals ) + { + int lane_id = utils::lane_id(); + int lane_mask_lt = utils::lane_mask_lt(); + + int warp_offset = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + vals[dst_offset] = m_regs_vals[i_step]; + warp_offset += __popc( poll ); + } + + if( !m_any_gmem ) + return; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + vals[dst_offset] = m_gmem_vals[offset]; + warp_offset += __popc( poll ); + } + } + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::store( int count, Key_type *keys, T *vals ) + { + int lane_id = utils::lane_id(); + int lane_mask_lt = utils::lane_mask_lt(); + + int warp_offset = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = 
warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + vals[dst_offset] = m_regs_vals[i_step]; + } + warp_offset += __popc( poll ); + } + + if( !m_any_gmem ) + return; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + vals[dst_offset] = m_gmem_vals[offset]; + } + warp_offset += __popc( poll ); + } + } + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::store_map_keys_scale_values( int count, const int *map, Key_type *keys, T alpha, T *vals ) + { + int lane_id = utils::lane_id(); + int lane_mask_lt = utils::lane_mask_lt(); + + int warp_offset = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = map[key]; + vals[dst_offset] = alpha*m_regs_vals[i_step]; + } + warp_offset += __popc( poll ); + } + + if( !m_any_gmem ) + return; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = map[key]; + vals[dst_offset] = alpha*m_gmem_vals[offset]; + } + warp_offset += __popc( poll ); + } + } + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + void Hash_map::store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ) + { + int lane_id = utils::lane_id(); + int lane_mask_lt = utils::lane_mask_lt(); + + int warp_offset = 0; + const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; +#pragma unroll + for( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) + { + const int offset = i_step*WARP_SIZE + lane_id; + Key_type key = m_smem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + vals[dst_offset] = alpha*m_regs_vals[i_step]; + } + warp_offset += __popc( poll ); + } + + if( !m_any_gmem ) + return; + +#pragma unroll 4 + for( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) + { + Key_type key = m_gmem_keys[offset]; + int poll = __ballot( key != -1 ); + if( poll == 0 ) + continue; + int dst_offset = warp_offset + __popc( poll & lane_mask_lt ); + if( key != -1 ) + { + keys[dst_offset] = key; + vals[dst_offset] = alpha*m_gmem_vals[offset]; + } + warp_offset += __popc( poll ); + } + } + + + + // ==================================================================================================================== + + template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > + __device__ __forceinline__ + bool Hash_map::update( Key_type key, T val ) + { + const int 
lane_id = utils::lane_id(); + bool done = key == -1, found = false; + + m_smem_vote[lane_id].b32 = 0x20202020; +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( i_hash > 0 && __all(done) ) + break; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE-1); + if( !done ) + { + Key_type stored_key = m_smem_keys[hash]; + if( stored_key == key ) + { + this->try_selection( hash, lane_id ); + found = true; + } + done = found || stored_key == -1; + } + } + + Word my_vote; + my_vote.b32 = m_smem_vote[lane_id].b32; +#pragma unroll + for( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) + { + int my_src = my_vote.b8[i_regs]; + T other_val = utils::shfl( val, my_src ); + if( my_src != WARP_SIZE ) + m_regs_vals[i_regs] += other_val; + } + + const int num_bits = utils::bfind( m_gmem_size ); // TODO: move it outside ::insert. +#pragma unroll + for( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) + { + if( __all(done) ) + return found; + unsigned ukey = reinterpret_cast( key ); + int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + if( !done ) + { + Key_type stored_key = m_gmem_keys[hash]; + if( stored_key == key ) + { + m_gmem_vals[hash] += val; + found = true; + } + done = found || stored_key == -1; + } + } + return found; + } + + + + + template + class Hash_Workspace + { + private: + // Do we need values on the GPU? + bool m_allocate_vals; + // Constant parameters. + const size_t m_grid_size, m_max_warp_count; + // The number of threads per row of B. + size_t m_num_threads_per_row_count, m_num_threads_per_row_compute; + // The size of the GMEM buffers (number of elements). + size_t m_gmem_size; + // The status: OK if count_non_zeroes succeeded, FAILED otherwise. + SHARED_PREFIX::shared_ptr m_status; + // The work queue for dynamic load balancing in the kernels. + SHARED_PREFIX::shared_ptr m_work_queue; + // The buffer to store keys in GMEM. + SHARED_PREFIX::shared_ptr m_keys; + // The buffer to store values in GMEM. + SHARED_PREFIX::shared_ptr m_vals; + + public: + // Create a workspace. + Hash_Workspace( bool allocate_vals = true, + size_t grid_size = 128, + size_t max_warp_count = 8, + size_t gmem_size = 2048 ): + m_allocate_vals(allocate_vals), + m_grid_size(grid_size), + m_max_warp_count(max_warp_count), + m_num_threads_per_row_count(32), + m_num_threads_per_row_compute(32), + m_gmem_size(gmem_size), + m_status(allocateDevice(1, NULL)), + m_work_queue(allocateDevice(1, NULL)) + { + allocate_workspace(); + } + + // Release memory used by the workspace. + virtual ~Hash_Workspace() + { + //purposely empty... + } + + // Get the size of GMEM. + size_t get_gmem_size() const { return m_gmem_size; } + // Get the status flag. + IndexT* get_status() const { return m_status.get(); } + // Get the work queue. + IndexT* get_work_queue() const { return m_work_queue.get(); } + // Get the keys. + Key_type* get_keys() const { return m_keys.get(); } + // Get the values. + Value_type* get_vals() const { return m_vals.get(); } + + // Expand the workspace. + void expand() { m_gmem_size *= 2; allocate_workspace(); } + + // Define the number of threads per row of B. + void set_num_threads_per_row_count( size_t val ) { m_num_threads_per_row_count = val; } + // Define the number of threads per row of B. 
+ void set_num_threads_per_row_compute( size_t val ) { m_num_threads_per_row_compute = val; } + + protected: + // Allocate memory to store keys/vals in GMEM. + virtual void allocate_workspace(void) + { + const size_t NUM_WARPS_IN_GRID = m_grid_size * m_max_warp_count; + size_t sz = NUM_WARPS_IN_GRID*m_gmem_size*sizeof(Key_type); + + m_keys = allocateDevice(sz, NULL); + + if( m_allocate_vals ) + { + sz = NUM_WARPS_IN_GRID*m_gmem_size*sizeof(Value_type); + m_vals = allocateDevice(sz, NULL); + } + } + }; + + namespace{ //unnamed... + + static __device__ __forceinline__ int get_work( int *queue, int warp_id, int count = 1 ) + { +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + int offset = -1; + if( utils::lane_id() == 0 ) + offset = atomicAdd( queue, count ); + return __shfl( offset, 0 ); +#else + return 0; +#endif + } + + enum { WARP_SIZE = 32, GRID_SIZE = 128, SMEM_SIZE = 128 }; + + template + __global__ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + __launch_bounds__( CTA_SIZE, 8 ) +#elif defined(__CUDA_ARCH__) + __launch_bounds__( CTA_SIZE, 6 ) +#endif + void fill_A_kernel_1x1( const size_t R_num_rows, + const IndexT *R_rows, + const IndexT *R_cols, + const IndexT *A_rows, + const IndexT *A_cols, + const IndexT *A_diag, + const Value_type *A_vals, + const IndexT *aggregates, + const IndexT *Ac_rows, + const IndexT *Ac_cols, + const IndexT *Ac_pos, + const IndexT *Ac_diag, + Value_type *Ac_vals, + size_t gmem_size, + IndexT *g_keys, + Value_type *g_vals, + IndexT *wk_work_queue ) + { + const size_t NUM_WARPS = CTA_SIZE / WARP_SIZE; + const size_t NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; + + // The hash keys stored in shared memory. + __shared__ volatile IndexT s_keys[NUM_WARPS*SMEM_SIZE]; + +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + // The hash values stored in shared memory. + __shared__ volatile Word s_vote[NUM_WARPS*SMEM_SIZE/4]; +#else + // Shared memory to vote. + __shared__ volatile IndexT s_bcast_row[CTA_SIZE]; + // The hash keys stored in shared memory. + __shared__ Value_type s_vals[NUM_WARPS*SMEM_SIZE]; + // Shared memory to acquire work. + __shared__ volatile IndexT s_offsets[NUM_WARPS]; + // Shared memory to reduce the diagonal. + __shared__ volatile Value_type s_diag[CTA_SIZE]; +#endif + + // The coordinates of the thread inside the CTA/warp. + const IndexT warp_id = utils::warp_id(); + const IndexT lane_id = utils::lane_id(); + + // Constants. + const size_t lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; + const size_t lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; + + // First threads load the row IDs of A needed by the CTA... + IndexT r_row_id = blockIdx.x*NUM_WARPS + warp_id; + + // Create local storage for the set. +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + Hash_map map( &s_keys[warp_id*SMEM_SIZE ], + &g_keys[r_row_id*gmem_size ], + &s_vote[warp_id*SMEM_SIZE/4], + &g_vals[r_row_id*gmem_size ], gmem_size ); +#else + Hash_map map( &s_keys[warp_id*SMEM_SIZE ], + &g_keys[r_row_id*gmem_size], + &s_vals[warp_id*SMEM_SIZE ], + &g_vals[r_row_id*gmem_size], gmem_size ); +#endif + + // Loop over rows of A. +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + for( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +#else + for( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +#endif + { + // The indices of the output row. + IndexT ac_col_it = Ac_rows[r_row_id+0]; + IndexT ac_col_end = Ac_rows[r_row_id+1]; + + // Clear the set first. TODO: Make sure it's needed. 
I don't think it is!!!! + map.clear(); + // Populate the map. + map.load( ac_col_end-ac_col_it, &Ac_cols[ac_col_it], &Ac_pos[ac_col_it] ); + + // Load the range of the row. TODO: Make sure it helps. + IndexT r_col_it = R_rows[r_row_id + 0]; + IndexT r_col_end = R_rows[r_row_id + 1]; + + // The diagonal. + Value_type r_diag(0); + + // _iterate over the columns of A to build C_hat. + for( r_col_it += lane_id ; __any(r_col_it < r_col_end) ; r_col_it += WARP_SIZE ) + { + // Is it an active thread. + const bool is_active = r_col_it < r_col_end; + + // Columns of A maps to rows of B. Each thread of the warp loads its A-col/B-row ID. + IndexT a_row_id = -1; + if( is_active ) + a_row_id = R_cols[r_col_it]; +#if __CUDA_ARCH__ < __CUDA_ARCH_THRESHOLD__ + s_bcast_row[threadIdx.x] = a_row_id; +#endif + + // Update the diagonal (if needed). + if( HAS_DIAG && is_active ) + r_diag = r_diag + A_vals[A_diag[a_row_id]]; + + const size_t num_rows = __popc( __ballot(is_active) ); + + // Uniform loop: threads collaborate to load other elements. + for( IndexT k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) + { + IndexT local_k = k+lane_id_div_num_threads; + + // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + const IndexT uniform_a_row_id = __shfl( a_row_id, local_k ); +#else + IndexT uniform_a_row_id = -1; + if( local_k < num_rows ) + uniform_a_row_id = s_bcast_row[warp_id*WARP_SIZE + local_k]; +#endif + + // The range of the row of B. + IndexT a_col_it = 0, a_col_end = 0; + if( local_k < num_rows ) + { + a_col_it = utils::Ld::load( &A_rows[uniform_a_row_id + 0] ); + a_col_end = utils::Ld::load( &A_rows[uniform_a_row_id + 1] ); + } + + // Iterate over the range of columns of B. + for( a_col_it += lane_id_mod_num_threads ; __any(a_col_it < a_col_end) ; a_col_it += NUM_THREADS_PER_ROW ) + { + // Load columns and values. + IndexT a_col_id = -1; Value_type a_value(Value_type(0)); + if( a_col_it < a_col_end ) + { + a_col_id = A_cols[a_col_it]; + a_value = A_vals[a_col_it]; + } + + // Find the aggregate. + IndexT a_agg_id = -1; + if( a_col_it < a_col_end ) + a_agg_id = aggregates[a_col_id]; + + + // Update the diag/hash map. + if( HAS_DIAG && a_agg_id == r_row_id ) + { + r_diag = r_diag + a_value; + a_agg_id = -1; + } + + map.insert_with_duplicates( a_agg_id, a_value, NULL ); // It won't insert. Only update. + } + } + } + + // Update the diagonal. + if( HAS_DIAG ) + { +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + r_diag = utils::warp_reduce<1, utils::Add>( r_diag ); +#else + utils::util::volcast(r_diag, s_diag + threadIdx.x); +#ifdef _MSC_VER + r_diag = utils::warp_reduce_sum<1, Value_type, 32>(s_diag, r_diag); +#else + r_diag = utils::warp_reduce<1, utils::Add>(s_diag, r_diag); +#endif +#endif + if( lane_id == 0 ) + Ac_vals[Ac_diag[r_row_id]] = r_diag; + } + + // Store the results. + IndexT count = ac_col_end - ac_col_it; + if( count == 0 ) + continue; + map.store( count, &Ac_vals[ac_col_it] ); + } + } + + template< size_t CTA_SIZE, + typename Workspace, + typename IndexT, + typename Value_type> + void fill_A_dispatch( Workspace &hash_wk, + const size_t R_num_rows, // same as num_aggregates. + const IndexT *R_rows, + const IndexT *R_cols, + const IndexT *A_rows, + const IndexT *A_cols, + const Value_type *A_vals, + const IndexT *aggregates, + const IndexT *Ac_rows, + const IndexT *Ac_cols, + const IndexT *Ac_pos, + Value_type *Ac_vals ) + { + const size_t NUM_WARPS = CTA_SIZE / WARP_SIZE; + cudaStream_t stream = 0; // for now... 
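+
+    // Seed the dynamic work queue: the first GRID_SIZE*NUM_WARPS rows are handed out
+    // statically (one per warp); warps that finish early fetch further row ids from this
+    // counter through get_work(), which does an atomicAdd and broadcasts the offset.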
+ + size_t work_offset = GRID_SIZE*NUM_WARPS; + cudaMemcpyAsync( hash_wk.get_work_queue(), &work_offset, sizeof(IndexT), cudaMemcpyHostToDevice, stream ); + cudaCheckError(); + + fill_A_kernel_1x1<8, CTA_SIZE, SMEM_SIZE, 32, false><<>>( + R_num_rows, + R_rows, + R_cols, + A_rows, + A_cols, + static_cast(0), + A_vals, + aggregates, + Ac_rows, + Ac_cols, + Ac_pos, + static_cast(0), + Ac_vals, + hash_wk.get_gmem_size(), + hash_wk.get_keys(), + hash_wk.get_vals(), + hash_wk.get_work_queue() ); + + + cudaCheckError(); + } + + template + __global__ __launch_bounds__( CTA_SIZE ) + void compute_sparsity_kernel( const size_t R_num_rows, // same as num_aggregates. + const IndexT *R_rows, + const IndexT *R_cols, + const IndexT *A_rows, + const IndexT *A_cols, + const IndexT *aggregates, + IndexT *Ac_rows, + IndexT *Ac_cols, + IndexT *Ac_pos, + const size_t gmem_size, + IndexT *g_keys, + IndexT *wk_work_queue, + IndexT *wk_status ) + { + const size_t NUM_WARPS = CTA_SIZE / WARP_SIZE; + const size_t NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; + + // The hash keys stored in shared memory. + __shared__ IndexT s_keys[NUM_WARPS*SMEM_SIZE]; + +#if __CUDA_ARCH__ < __CUDA_ARCH_THRESHOLD__ + // Shared memory to acquire work. + __shared__ volatile IndexT s_offsets[NUM_WARPS]; + // Shared memory to vote. + __shared__ volatile IndexT s_bcast_cols[CTA_SIZE]; +#endif + + // The coordinates of the thread inside the CTA/warp. + const IndexT warp_id = utils::warp_id(); + const IndexT lane_id = utils::lane_id(); + + printf("###### milestone 1\n"); + + // Constants. + const IndexT lane_id_div_num_threads = lane_id / NUM_THREADS_PER_ROW; + const IndexT lane_id_mod_num_threads = lane_id % NUM_THREADS_PER_ROW; + + // First threads load the row IDs of A needed by the CTA... + IndexT r_row_id = blockIdx.x*NUM_WARPS + warp_id; + + // Create local storage for the set. +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + Hash_set set( &s_keys[warp_id*SMEM_SIZE], &g_keys[r_row_id*gmem_size], gmem_size ); +#else + Hash_set set( &s_keys[warp_id*SMEM_SIZE], &g_keys[r_row_id*gmem_size], gmem_size ); +#endif + + printf("###### milestone 2\n"); + + // Loop over rows of R. +// #if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + for( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) +// #else +// for( ; r_row_id < R_num_rows ; r_row_id = get_work( s_offsets, wk_work_queue, warp_id ) ) +// #endif + { + // Make sure we have to proceed. + if( COUNT_ONLY ) + { + volatile IndexT *status = reinterpret_cast( wk_status ); + if( set.has_failed() || *status != 0 ) + return; + } + + // Clear the set. + set.clear(); + + // Load the range of the row. + IndexT r_col_it = R_rows[r_row_id + 0]; + IndexT r_col_end = R_rows[r_row_id + 1]; + + printf("###### milestone 3\n"); + + // Iterate over the columns of R. + for( r_col_it += lane_id ; __any(r_col_it < r_col_end) ; r_col_it += WARP_SIZE ) + { + // Is it an active thread. + const bool is_active = r_col_it < r_col_end; + + // Columns of R map to rows of A. Each thread of the warp loads its R-col/A-row ID. + IndexT a_row_id = -1; + if( is_active ) + a_row_id = R_cols[r_col_it]; +#if __CUDA_ARCH__ < __CUDA_ARCH_THRESHOLD__ + s_bcast_cols[threadIdx.x] = a_row_id; +#endif + const size_t num_rows = __popc( __ballot(is_active) ); + + printf("###### milestone 4\n"); + + // Uniform loop: threads collaborate to load other elements. + for( IndexT k = 0 ; k < num_rows ; k += NUM_LOADED_ROWS ) + { + IndexT local_k = k+lane_id_div_num_threads; + // Is it an active thread. 
+ bool is_active_k = local_k < num_rows; + + // Threads in the warp proceeds columns of B in the range [bColIt, bColEnd). +#if __CUDA_ARCH__ >= __CUDA_ARCH_THRESHOLD__ + const IndexT uniform_a_row_id = __shfl( a_row_id, local_k ); +#else + IndexT uniform_a_row_id = -1; + if( is_active_k ) + uniform_a_row_id = s_bcast_cols[warp_id*WARP_SIZE + local_k]; +#endif + + printf("###### milestone 5\n"); + + // Load the range of the row of B. + IndexT a_col_it = 0, a_col_end = 0; + if( is_active_k ) + { + a_col_it = A_rows[uniform_a_row_id + 0]; + a_col_end = A_rows[uniform_a_row_id + 1]; + } + + // Iterate over the range of columns of B. + for( a_col_it += lane_id_mod_num_threads ; __any(a_col_it < a_col_end) ; a_col_it += NUM_THREADS_PER_ROW ) + { + IndexT a_col_id = -1, a_agg_id = -1; + if( a_col_it < a_col_end ) + { + a_col_id = A_cols[a_col_it]; + a_agg_id = aggregates[a_col_id]; + } + //if( a_agg_id >= R_num_rows ) + // printf( "Out of range aggregate!!!\n" ); + if( HAS_DIAG && a_agg_id == r_row_id ) + a_agg_id = -1; + set.insert( a_agg_id, COUNT_ONLY ? wk_status : NULL ); + } + } + } + + printf("###### milestone 6\n"); + + // Store the results. + if( COUNT_ONLY ) + { + IndexT count = set.compute_size_with_duplicates(); + if( lane_id == 0 ) + Ac_rows[r_row_id] = count; + } + else + { + IndexT ac_col_it = Ac_rows[r_row_id]; + set.store_with_positions( &Ac_cols[ac_col_it], &Ac_pos[ac_col_it] ); + } + } + } + + + + template< size_t CTA_SIZE, + bool HAS_DIAG, + bool COUNT_ONLY, + typename Workspace, + typename IndexT> + void compute_sparsity_dispatch( Workspace &hash_wk, + const size_t R_num_rows, + const IndexT *R_rows, + const IndexT *R_cols, + const IndexT *A_rows, + const IndexT *A_cols, + const IndexT *aggregates, + IndexT *Ac_rows, + IndexT *Ac_cols, + IndexT *Ac_pos ) + { + const size_t NUM_WARPS = CTA_SIZE / WARP_SIZE; + + //AMGX uses pool allocator thrust::global_thread_handle::cudaMallocHost(), here... + // + SHARED_PREFIX::shared_ptr h_status(new IndexT); + SHARED_PREFIX::shared_ptr h_work_offset(new IndexT); + + cudaStream_t stream = 0; // for now... + + int attempt = 0; + for( bool done = false ; !done && attempt < 10 ; ++attempt ) + { + // Double the amount of GMEM (if needed). + if( attempt > 0 ) + { + std::cerr << "LOW_DEG: Requires " << hash_wk.get_gmem_size() << " items per warp!!!" << std::endl; + hash_wk.expand(); + } + + // Reset the status. + IndexT *p_status = h_status.get(); + *p_status = 0; + cudaMemcpyAsync( hash_wk.get_status(), p_status, sizeof(IndexT), cudaMemcpyHostToDevice, stream ); + cudaCheckError(); + + // Reset the work queue. + IndexT *p_work_offset = h_work_offset.get(); + *p_work_offset = GRID_SIZE*NUM_WARPS; + cudaMemcpyAsync( hash_wk.get_work_queue(), p_work_offset, sizeof(IndexT), cudaMemcpyHostToDevice, stream ); + cudaCheckError(); + + // Launch the kernel. + compute_sparsity_kernel<8, CTA_SIZE, SMEM_SIZE, WARP_SIZE, HAS_DIAG, COUNT_ONLY><<>>(R_num_rows, R_rows, R_cols, A_rows, A_cols, aggregates, Ac_rows, Ac_cols, Ac_pos, hash_wk.get_gmem_size(), hash_wk.get_keys(), hash_wk.get_work_queue(), hash_wk.get_status() ); + + cudaCheckError(); + + // Read the result from count_non_zeroes. 
+ cudaMemcpyAsync( p_status, hash_wk.get_status(), sizeof(IndexT), cudaMemcpyDeviceToHost, stream ); + cudaStreamSynchronize(stream); + done = (*p_status == 0); + + cudaCheckError(); + } + } + }//end unnamed namespace + +}//nvgraph namespace + +#endif diff --git a/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx b/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx new file mode 100644 index 00000000000..e958a27ed0c --- /dev/null +++ b/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx @@ -0,0 +1,1699 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPH_CONTRACTING_VISITOR_HXX +#define GRAPH_CONTRACTING_VISITOR_HXX +// +// + +#include //which includes all other headers... +#include // TODO: to be changed to thrust/range_view.h, when toolkit gets in sync with Thrust +#include +///#include +#include +#include +#include +#include +#include +#include +#include // +#include +#include +#include // +#include // + +#include +#include +#include +#include +#include +#include // + +//debugging only: +#include + +#define __CUDA_ARCH_THRESHOLD__ 300 +///#define __CUDA_ARCH_THRESHOLD__ 350 +// +namespace nvgraph +{ + + + + //SpMv + SpMM + SpMM: + // cntrctd_vertex_data = S*v(g_vertex_data); + // cntrctd_edge_data = (S*G(g_edge_data)*St).values + // + //see GraphContractionFunctor::computeRestrictionOperator() for S matrix CSR data + // + template //edge "addition" functor type + struct SemiringContractionUtilities + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValT; + + typedef typename VectorPtrT::PtrT PtrI; + typedef typename VectorPtrT::PtrT PtrV; + + SemiringContractionUtilities(const VectorI& g_row_offsets, //original graph CSR + const VectorI& g_col_indices, + const VectorI& S_row_offsets, + const VectorI& S_col_indices, + VertexCombineFctr& v_combine, + VertexReduceFctr& v_reduce, + EdgeCombineFctr& e_combine, + EdgeReduceFctr& e_reduce): + m_g_row_offsets(g_row_offsets), + m_g_col_indices(g_col_indices), + m_v_combine(v_combine), + m_v_reduce(v_reduce), + m_e_combine(e_combine), + m_e_reduce(e_reduce), + m_n_agg(S_row_offsets.size()-1), + m_g_nr(g_row_offsets.size()-1), // == S_nc + m_g_nnz(g_row_offsets.back()), + m_s_nnz(S_row_offsets.back()) + { + VectorV S_vals(m_s_nnz, 1); + + PtrV p_S_vals(S_vals.data().get()); + VWrapper S_vals_w(p_S_vals, p_S_vals+S_vals.size()); + + //NOT necessarily square! 
+ m_S = make_csr_matrix(m_g_nr, S_row_offsets, S_col_indices, S_vals_w); + + m_St = cusp::csr_matrix(m_g_nr, m_n_agg, m_s_nnz); + cusp::transpose(m_S, m_St); + cudaCheckError(); + } + + virtual ~SemiringContractionUtilities(void) + { + } + + const VectorI& get_row_ptr(void) const + { + return m_cntrctd_row_offsets; + } + + const VectorI& get_col_ind(void) const + { + return m_cntrctd_col_indices; + } + + IndexT get_subg_nnz(void) const + { + return m_cntrctd_row_offsets.back(); + } + + virtual void update_vertex_data(/*In: */const VWrapper& g_vertex_data,//multivalue vertex entry of original graph, size==g_nr + /*Out:*/VWrapper& cntrctd_vertex_data)//multivalue vertex entry of contracted graph, size==n_agg==S_nr (assumed allocated!) + { + //SpMv: + // + assert( m_g_nr == g_vertex_data.size() ); + cusp::array1d x(g_vertex_data.cbegin(), g_vertex_data.cend()); + cusp::array1d y(m_n_agg,0); + + cusp::constant_functor initialize; + cusp::multiply(m_S, x, y, initialize, m_v_combine, m_v_reduce); + cudaCheckError(); + + thrust::copy(y.begin(), y.end(), cntrctd_vertex_data.begin()); + cudaCheckError(); + } + + virtual void update_topology_only(void) + { + cudaCheckError(); + //SpMM+SpMM: S*G*St + // + VectorV empty(m_g_nnz, 1);//0 => empty G matrix, use 1's as values + + PtrV ptr_e(&empty[0]); + VWrapper g_edge_data(ptr_e, ptr_e+m_g_nnz); + cudaCheckError(); + + cusp::csr_matrix G = + make_square_csr_matrix(m_g_row_offsets, m_g_col_indices, g_edge_data); + cudaCheckError(); + + cusp::constant_functor initialize; + + //L=S*G + cusp::csr_matrix L;//no need to allocate! + cusp::multiply(m_S, G, L, initialize, m_e_combine, m_e_reduce); + cudaCheckError(); + + //R = L*St + cusp::csr_matrix R;//no need to allocate! + cusp::multiply(L, m_St, R, initialize, m_e_combine, m_e_reduce); + cudaCheckError(); + + //##### debug: + //std::cout<<"S:\n";cusp::print(m_S); + //std::cout<<"R:\n";cusp::print(R); + + size_t r_sz = R.row_offsets.size(); + assert( r_sz > 0 ); + + size_t cntrctd_nnz = R.row_offsets.back(); + ///size_t cntrctd_nr = r_sz-1; + + //allocate cntrctd_csr_data: + m_cntrctd_row_offsets = VectorI(r_sz, 0); + m_cntrctd_col_indices = VectorI(cntrctd_nnz, 0); + + thrust::copy(R.row_offsets.begin(), R.row_offsets.end(), m_cntrctd_row_offsets.begin()); + cudaCheckError(); + thrust::copy(R.column_indices.begin(), R.column_indices.end(), m_cntrctd_col_indices.begin()); + cudaCheckError(); + } + + virtual void update_edge_data(/*In: */const VWrapper& g_edge_data, //multivalue edge entry of original graph, size==g_nnz + /*Out:*/VWrapper& cntrctd_edge_data) //multivalue edge entry of contracted graph, size==nnz(S*G*St) (assumed allocated!) + { + //SpMM+SpMM: S*G*St + // + assert( m_g_nnz == g_edge_data.size() ); + cusp::csr_matrix G = + make_square_csr_matrix(m_g_row_offsets, m_g_col_indices, g_edge_data); + cudaCheckError(); + + cusp::constant_functor initialize; + cudaCheckError(); + + //L=S*G + cusp::csr_matrix L;//no need to allocate! + cusp::multiply(m_S, G, L, initialize, m_e_combine, m_e_reduce); + cudaCheckError(); + + //R = L*St //##### crash here: + cusp::csr_matrix R;//no need to allocate! 
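+      // R = (S*G)*St is the contracted adjacency: e.g. with (*,+) edge functors, entry (i,j)
+      // accumulates the data of every edge of G whose endpoints map to aggregates i and j.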
+ cusp::multiply(L, m_St, R, initialize, m_e_combine, m_e_reduce); + cudaCheckError(); + + size_t r_sz = R.row_offsets.size(); + assert( r_sz > 0 ); + + size_t cntrctd_nnz = R.row_offsets.back(); + ///size_t cntrctd_nr = r_sz-1; + + //allocate cntrctd_csr_data: + m_cntrctd_row_offsets = VectorI(r_sz, 0); + m_cntrctd_col_indices = VectorI(cntrctd_nnz, 0); + + thrust::copy(R.row_offsets.begin(), R.row_offsets.end(), m_cntrctd_row_offsets.begin()); + cudaCheckError(); + + thrust::copy(R.column_indices.begin(), R.column_indices.end(), m_cntrctd_col_indices.begin()); + cudaCheckError(); + + thrust::copy(R.values.begin(), R.values.end(), cntrctd_edge_data.begin()); + cudaCheckError(); + } + + virtual void update_all(/*In: */const VWrapper& g_vertex_data,//multivalue vertex entry of original graph, size==g_nr + /*Out:*/VWrapper& cntrctd_vertex_data,//multivalue vertex entry of contracted graph, size==n_agg==S_nr (assumed allocated!) + /*In: */const VWrapper& g_edge_data, //multivalue edge entry of original graph, size==g_nnz + /*Out:*/VWrapper& cntrctd_edge_data) //multivalue edge entry of contracted graph, size==nnz(S*G*St) (assumed allocated!) + { + update_vertex_data(g_vertex_data, cntrctd_vertex_data); + update_edge_data(g_edge_data, cntrctd_edge_data); + } + + protected: + static cusp::csr_matrix + make_csr_matrix(size_t nc, + const VectorI& row_offsets, + const VectorI& col_indices, + const VWrapper& vals) + { + size_t nr = row_offsets.size()-1; + size_t nz = row_offsets.back(); + + cusp::csr_matrix A(nr, nc, nz); + + //copy: + // + A.row_offsets = row_offsets; + A.column_indices = col_indices; + + thrust::copy(vals.cbegin(), vals.cend(), A.values.begin()); + cudaCheckError(); + + return A; + } + + static cusp::csr_matrix + make_square_csr_matrix(const VectorI& row_offsets, + const VectorI& col_indices, + const VWrapper& vals) + { + size_t nc = row_offsets.size()-1; + + return make_csr_matrix(nc, row_offsets, col_indices, vals); + } + + private: + //Input: + // + const VectorI& m_g_row_offsets; //original graph CSR data: + const VectorI& m_g_col_indices; + cusp::csr_matrix m_S; //aggreagate matrix + cusp::csr_matrix m_St; //aggreagate matrix transpose + + //Output: + // + VectorI m_cntrctd_row_offsets; //contracted graph CSR data: + VectorI m_cntrctd_col_indices; + + //I/O: + // + VertexCombineFctr& m_v_combine; //vertex "multiplication" functor + VertexReduceFctr& m_v_reduce; //vertex "addition" functor + EdgeCombineFctr& m_e_combine; //edge "multiplication" functor + EdgeReduceFctr& m_e_reduce; //edge "addition" functor + + const size_t m_n_agg; + const size_t m_g_nr; // == S_nc + const size_t m_g_nnz; + const size_t m_s_nnz; + + }; + + //generic value updater + // + template //only used by the specialized template + struct ContractionValueUpdater + { + typedef typename VectorI::value_type IndexT; + //typedef typename VectorPtrT::PtrT PtrI; + + typedef typename VectorV::value_type ValueT; + typedef typename VectorPtrT::PtrT PtrV; + + //TODO: make template argument: + typedef range_view VWrapper; + + //v_src, v_dest assumed pre-allocated! 
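// Worked example (editorial sketch, not part of the original nvGraph source):
// this updater delegates to SemiringContractionUtilities, whose triple product
// S*G*St with a (multiply, plus) semiring sums, for each aggregate pair (I,J),
// the weights of all original edges (i,j) with aggregate(i) == I and
// aggregate(j) == J. For aggregates = {0,0,1} and edges (0,1)=2, (1,2)=3,
// (2,0)=4, the contracted graph has edges (0,0)=2, (0,1)=3 and (1,0)=4;
// choosing Min or Max reducers instead keeps the smallest or largest
// crossing weight rather than their sum.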
+ // + ContractionValueUpdater(/*const */VectorV& v_src, + VectorV& v_dest, + VertexCombineFctr& v_combine, + VertexReduceFctr& v_reduce, + EdgeCombineFctr& e_combine, + EdgeReduceFctr& e_reduce): + v_s_(v_src), + v_d_(v_dest), + m_v_combine(v_combine), + m_v_reduce(v_reduce), + m_e_combine(e_combine), + m_e_reduce(e_reduce) + { + } + + //TODO: more efficient solution with VWrapper, to avoid device memory traffic + // + void update_from(///Hash_Workspace& hash_wk,//only used by the specialized template + ///size_t num_aggregates,//only used by the specialized template + const VectorI& R_row_offsets, + const VectorI& R_column_indices, + const VectorI& g_row_offsets, + const VectorI& g_col_indices) + ///const VectorI& aggregates,//only used by the specialized template + ///const VectorI& cg_row_offsets,//only used by the specialized template + ///const VectorI& cg_col_indices,//only used by the specialized template + ///const VectorI& Ac_pos)//only used by the specialized template + { + // PtrI ptr(&seq[0]); + // int* raw_ptr = ptr.get(); + // PtrI ptr0(raw_ptr); + // range_view rv0(ptr0, ptr0+n); + + size_t n_s = v_s_.size(); + PtrV ptr_src(&v_s_[0]); + //ValueT* p_s = v_s_.data().get(); + VWrapper g_edge_data(ptr_src, ptr_src+n_s); + ///VWrapper g_edge_data(v_s_.cbegin(), v_s_.cend());//nope... + + size_t n_d = v_d_.size(); + PtrV ptr_dst(&v_d_[0]); + //ValueT* p_d = v_d_.data().get(); + VWrapper cg_edge_data(ptr_dst, ptr_dst+n_d); + //R == S + // + SemiringContractionUtilities + sr(g_row_offsets, + g_col_indices, + R_row_offsets, + R_column_indices, + m_v_combine, + m_v_reduce, + m_e_combine, + m_e_reduce); + + sr.update_edge_data(g_edge_data, cg_edge_data); + } + + const VectorV& get_cg_vals(void) const + { + return v_d_; + } + private: + /*const */VectorV& v_s_; + VectorV& v_d_; + + VertexCombineFctr& m_v_combine; + VertexReduceFctr& m_v_reduce; + EdgeCombineFctr& m_e_combine; + EdgeReduceFctr& m_e_reduce; + }; + + //partial specialization for (Combine, Reduce) == (*,+) + // + // template + // struct ContractionValueUpdater, + // thrust::plus, + // thrust::multiplies, + // thrust::plus, + // CTA_SIZE> + // { + // typedef typename VectorI::value_type IndexT; + // //typedef typename VectorPtrT::PtrT PtrI; + + // typedef typename VectorV::value_type ValueT; + // typedef typename VectorPtrT::PtrT PtrV; + + // //v_src, v_dest assumed pre-allocated! 
+ // // + // ContractionValueUpdater(/*const */VectorV& v_src, + // VectorV& v_dest, + // thrust::multiplies& , + // thrust::plus& , + // thrust::multiplies& , + // thrust::plus& ): + // v_s_(v_src), + // v_d_(v_dest) + // { + // } + + // void update_from(Hash_Workspace& hash_wk, + // size_t num_aggregates, + // const VectorI& R_row_offsets, + // const VectorI& R_column_indices, + // const VectorI& g_row_offsets, + // const VectorI& g_col_indices, + // const VectorI& aggregates, + // const VectorI& cg_row_offsets, + // const VectorI& cg_col_indices, + // const VectorI& Ac_pos) + // { + // fill_A_dispatch(hash_wk, + // num_aggregates, + // R_row_offsets.data().get(), + // R_column_indices.data().get(), + // g_row_offsets.data().get(), + // g_col_indices.data().get(), + // v_s_.data().get(), + // aggregates.data().get(), + // cg_row_offsets.data().get(), + // cg_col_indices.data().get(), + // thrust::raw_pointer_cast( &Ac_pos.front() ), + // v_d_.data().get()); + // cudaCheckError(); + // } + + // const VectorV& get_cg_vals(void) const + // { + // return v_d_; + // } + // private: + // /*const */VectorV& v_s_; + // VectorV& v_d_; + // }; + + + + + template + struct GraphContractionFunctor + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + typedef typename VectorB::value_type ValueB; + + typedef typename VectorPtrT::PtrT PtrB; + typedef typename VectorPtrT::PtrT PtrI; + typedef typename VectorPtrT::PtrT PtrV; + // num_aggregates != m_aggregates.size()!!! + // Need m_num_aggregates const member + // + GraphContractionFunctor(size_t g_n_vertices, + const VectorI& aggregates, /*const */ + size_t num_aggregates, + VertexCombineFctr& v_combine, + VertexReduceFctr& v_reduce, + EdgeCombineFctr& e_combine, + EdgeReduceFctr& e_reduce): + m_num_rows(g_n_vertices), + m_aggregates(aggregates), + m_num_aggregates(num_aggregates), + m_v_combine(v_combine), + m_v_reduce(v_reduce), + m_e_combine(e_combine), + m_e_reduce(e_reduce) + { + computeRestrictionOperator(); + cudaCheckError(); + } + + virtual ~GraphContractionFunctor(void) + { + } + + const VectorI& get_aggregates(void) const + { + return m_aggregates; + } + + size_t get_num_aggregates(void) const + { + return m_num_aggregates; + } + + const VectorI& get_R_row_offsets(void) const + { + return m_R_row_offsets; + } + + const VectorI& get_R_column_indices(void) const + { + return m_R_column_indices; + } + + VertexCombineFctr& get_v_combine(void) + { + return m_v_combine; + } + + VertexReduceFctr& get_v_reduce(void) + { + return m_v_reduce; + } + + EdgeCombineFctr& get_e_combine(void) + { + return m_e_combine; + } + + EdgeReduceFctr& get_e_reduce(void) + { + return m_e_reduce; + } + + protected: + void computeRestrictionOperator(void) + { + size_t n_aggregates = m_num_aggregates;//nope: m_aggregates.size(); + m_R_row_offsets.resize(n_aggregates+1);//create one more row for the pseudo aggregate (?) 
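// Worked example (editorial sketch, not part of the original nvGraph source):
// the steps below build the restriction matrix R in CSR form straight from the
// aggregate labels: the column indices start as the identity permutation
// 0..n-1, get sorted by aggregate label, and thrust::lower_bound then yields
// the row offsets. For aggregates = {1,0,1,2,0} (5 vertices, 3 aggregates):
//
//   sorted labels    : {0,0,1,1,2}
//   R_column_indices : {1,4,0,2,3}   (original vertex ids grouped by aggregate)
//   R_row_offsets    : {0,2,4,5}
//
// i.e. aggregate 0 = {1,4}, aggregate 1 = {0,2}, aggregate 2 = {3}; ties within
// an aggregate may come out in either order since the sort is not stable.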
+ VectorI R_row_indices(m_aggregates); + + m_R_column_indices.resize(m_num_rows); + thrust::sequence(m_R_column_indices.begin(),m_R_column_indices.end()); + cudaCheckError(); + + thrust::sort_by_key(R_row_indices.begin(),R_row_indices.end(),m_R_column_indices.begin()); + cudaCheckError(); + + thrust::lower_bound(R_row_indices.begin(), + R_row_indices.end(), + thrust::counting_iterator(0), + thrust::counting_iterator(m_R_row_offsets.size()), + m_R_row_offsets.begin()); + cudaCheckError(); + } + + //code "parked" for the time being; + //it uses the AMGX approach which has a bug + //un-debuggable due to nvcc failure with -g -G pair + //(bug: https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=1813290&cmtNo) + // + struct NoValueUpdater + { + void update_from(///Hash_Workspace& hash_wk, + ///size_t num_aggregates, + const VectorI& R_row_offsets, + const VectorI& R_column_indices, + const VectorI& g_row_offsets, + const VectorI& g_col_indices) + ///const VectorI& aggregates, + ///const VectorI& cg_row_offsets, + ///const VectorI& cg_col_indices, + ///const VectorI& Ac_pos) + { + //no-op... + } + }; + + virtual void operator() (VectorI& g_row_ptr_, + VectorI& g_col_ind_) + { + NoValueUpdater updater;//dummy object... + + contract(g_row_ptr_, g_col_ind_, updater); + } + + virtual void operator () (VectorV& g_vals_, + VectorI& g_row_ptr_, + VectorI& g_col_ind_) + { + ContractionValueUpdater + updater(g_vals_, + m_cg_values, + m_v_combine, + m_v_reduce, + m_e_combine, + m_e_reduce); + + contract(g_row_ptr_, g_col_ind_, updater); + } + + const VectorI& get_row_ptr(void) const + { + return m_cg_row_offsets; + } + + const VectorI& get_col_ind(void) const + { + return m_cg_col_indices; + } + + IndexT get_subg_nnz(void) const + { + return m_cg_row_offsets.back(); + } + + template + void contract(VectorI& g_row_offsets, //contracted + VectorI& g_col_indices, //contracted + ValUpdaterFctr fctrv) + { + //notation mapping from AMGX->nvGRAPH: + // + //S (Restriction) matrix data: + //R_row_offsets -> m_R_row_offsets + //R_column_indices -> m_R_column_indices + // + //Graph matrix data: + //A.row_offsets -> g_row_offsets + //A.col_indices -> g_col_indices + // + //Contracted matrix data: + //Ac.row_offsets -> m_cg_row_offsets + //Ac.col_indices -> m_cg_col_indices + // + //num_aggregates != m_aggregates.size()!!! + // + ///size_t num_aggregates = m_aggregates.size(); //nope... + //size_t sz_aggregates = m_aggregates.size(); + // TODO: check why no size() for amgx::IVector + + m_cg_row_offsets.resize( m_num_aggregates+1 ); + + //##### update topology: + //{ + // Hash_Workspace hash_wk; + + // compute_sparsity_dispatch(hash_wk, + // m_num_aggregates,//????? + // m_R_row_offsets.data().get(), + // m_R_column_indices.data().get(), + // g_row_offsets.data().get(), + // g_col_indices.data().get(), + // m_aggregates.data().get(), + // m_cg_row_offsets.data().get(), + // static_cast(0), //ok + // static_cast(0));//ok + // cudaCheckError(); + + // // Compute the number of non-zeroes. + // thrust::exclusive_scan( m_cg_row_offsets.begin(), m_cg_row_offsets.end(), m_cg_row_offsets.begin() ); + // cudaCheckError(); + + ///IndexT nonzero_blocks = m_cg_row_offsets[m_num_aggregates]; + + // // Vector to store the positions in the hash table. + ///VectorI Ac_pos(nonzero_blocks); + + // compute_sparsity_dispatch(hash_wk, + // m_num_aggregates,///????? 
+ // m_R_row_offsets.data().get(), + // m_R_column_indices.data().get(), + // g_row_offsets.data().get(), + // g_col_indices.data().get(), + // m_aggregates.data().get(), + // m_cg_row_offsets.data().get(), + // m_cg_col_indices.data().get(), + // thrust::raw_pointer_cast( &Ac_pos.front() )); + // cudaCheckError(); + //} end update topology + + //##### update values: + //{ + //act (or not) on values: + // + fctrv.update_from(///hash_wk, + ///m_num_aggregates,///????? + m_R_row_offsets, + m_R_column_indices, + g_row_offsets, + g_col_indices); + ///m_aggregates, + ///m_cg_row_offsets, + ///m_cg_col_indices, + ///Ac_pos); + //}end update values + + } + + private: + size_t m_num_rows; // number of vertices in the original graph + VectorI m_aggregates; // labels of vertices to be collapsed (vertices with same label will be collapsed into one) + const size_t m_num_aggregates; // != m_aggregates.size() !!! + + //Restrictor CSR info + //Restrictor = S "matrix" in algorithm 4.5 in "Graph Algorithms in the language of Linear Algebra") + VectorI m_R_row_offsets; + VectorI m_R_column_indices; + + //Contracted graph data: + VectorI m_cg_row_offsets; + VectorI m_cg_col_indices; + VectorV m_cg_values; + + //Contraction functors: + // + VertexCombineFctr& m_v_combine; + VertexReduceFctr& m_v_reduce; + EdgeCombineFctr& m_e_combine; + EdgeReduceFctr& m_e_reduce; + }; + +namespace{ //unnamed.. + template + size_t validate_contractor_input(const VectorI& v, size_t g_nrows) + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorI::iterator Iterator; + + size_t n = v.size(); + + if( n == 0 ) + FatalError("0-sized array input in graph contraction.",NVGRAPH_ERR_BAD_PARAMETERS); + + if( n != g_nrows ) + FatalError("Aggregate array size must match number of vertices of original graph",NVGRAPH_ERR_BAD_PARAMETERS); + + //find min/max values in aggregates... + //and check if min==0 and max <= g_nrows-1... + VectorI res(v);//copy + cudaCheckError(); + thrust::pair result = thrust::minmax_element(res.begin(), res.end()); + if( *result.first != 0 ) + FatalError("Aggregate array values must start from 0.",NVGRAPH_ERR_BAD_PARAMETERS); + cudaCheckError(); + + if( static_cast(*result.second) > g_nrows-1 ) + FatalError("Aggregate array values must be less than number of vertices of original graph.",NVGRAPH_ERR_BAD_PARAMETERS); + + //then make sure all values in between are covered... + //use count_distinct() and see if there are max-min+1 + size_t n_expected = *result.second - *result.first + 1; + + thrust::sort(res.begin(), res.end()); + cudaCheckError(); + size_t counts = thrust::distance(res.begin(), thrust::unique(res.begin(), res.end())); + cudaCheckError(); + + if( counts != n_expected ) + FatalError("Aggregate array intermediate values (between 0 and max(aggregates)) are missing.",NVGRAPH_ERR_BAD_PARAMETERS); + + //return # aggregates (not to be confused with aggregates.size()!) + return n_expected; + } +}//end unnamed namespace + + + //(the C header will have something similar) + //add more enums for additional Functor Types; + // + //CAVEAT: NrFctrTypes MUST be last in enum! + //additions can be made anywhere between enum...=0 and NrFctrTypes! + // + typedef enum{Multiply=0, Sum, Min, Max, NrFctrTypes} SemiRingFunctorTypes; + + //Partial specialization to select proper + //functor through an integer, at compile time (?) 
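// Illustrative note (editorial, not part of the original nvGraph source;
// ValueT below stands for the element type parameter):
// the selector declared next maps a runtime enum value to a Thrust functor
// type at compile time, per the specializations that follow, e.g.
//
//   SemiRingFctrSelector<Sum, ValueT>::FctrType  ->  thrust::plus<ValueT>
//   SemiRingFctrSelector<Min, ValueT>::FctrType  ->  thrust::minimum<ValueT>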
+ // + template + struct SemiRingFctrSelector; + + template + struct SemiRingFctrSelector + { + typedef typename thrust::multiplies FctrType; + }; + + template + struct SemiRingFctrSelector + { + typedef typename thrust::plus FctrType; + }; + + template + struct SemiRingFctrSelector + { + typedef typename thrust::minimum FctrType; + }; + + template + struct SemiRingFctrSelector + { + typedef typename thrust::maximum FctrType; + }; + + //...add more specializations for additional Functor Types + + //Acyclic Visitor + // (A. Alexandrescu, "Modern C++ Design", Section 10.4), + // where *concrete* Visitors must be parameterized by all + // the possibile template args of the Visited classes (visitees); + // + + //Visitor for SubGraph extraction: + // + template //edge "addition" functor type> + struct GraphContractionVisitor: + VisitorBase, + Visitor >, + Visitor >, + Visitor >, + Visitor > + { + typedef typename VectorI::value_type IndexType_; + typedef typename VectorV::value_type ValueType_; + typedef typename VectorPtrT::PtrT PtrI; + typedef typename VectorPtrT::PtrT PtrV; + typedef range_view VWrapper; + + typedef GraphContractionFunctor CFunctor; + + //TODO: avoid copy from raw pointer + // + GraphContractionVisitor(CsrGraph& graph, + const VectorI& aggregates, /*const */ + cudaStream_t stream, + VertexCombineFctr& v_combine, + VertexReduceFctr& v_reduce, + EdgeCombineFctr& e_combine, + EdgeReduceFctr& e_reduce): + m_g_row_ptr_(graph.get_raw_row_offsets(), + graph.get_raw_row_offsets()+graph.get_num_vertices()+1), + m_g_col_ind_(graph.get_raw_column_indices(), + graph.get_raw_column_indices()+graph.get_num_edges()), + // num_aggregates != m_aggregates.size()!!! + // need to calculate num_aggregates (validate_..() does it) + // and pass it to contractor: + // + contractor_(graph.get_num_vertices(), + aggregates, + validate_contractor_input(aggregates, graph.get_num_vertices()), + v_combine, + v_reduce, + e_combine, + e_reduce), + stream_(stream), + contracted_graph_(0) + { + cudaCheckError(); + //empty... + } + + void Visit(Graph& graph) + { + //no-op... 
+ } + + void Visit(CsrGraph& graph_src) + { + //(non-AMGX version): + //SemiRing::update_topology(contractor_.get_row_ptr(), contractor_.get_col_ind()); + typedef typename SemiRingFctrSelector::FctrType MultiplyFctr; + typedef typename SemiRingFctrSelector::FctrType SumFctr; + + MultiplyFctr mult; + SumFctr sum; + + SemiringContractionUtilities + sr(m_g_row_ptr_, + m_g_col_ind_, + contractor_.get_R_row_offsets(), + contractor_.get_R_column_indices(), + mult, + sum, + mult, + sum); + + sr.update_topology_only(); + + ///contractor_(m_g_row_ptr_, m_g_col_ind_);//just drop it, no-op, here, all work done by sr + + size_t rowptr_sz = sr.get_row_ptr().size(); + assert( rowptr_sz >= 1 ); + + size_t contrctd_nrows = rowptr_sz-1; + size_t contrctd_nnz = sr.get_subg_nnz(); + + if( contracted_graph_ ) + delete contracted_graph_; + + contracted_graph_ = new CsrGraph(contrctd_nrows, contrctd_nnz, stream_); + + //TODO: more efficient solution: investigate if/how copy can be avoided + // + thrust::copy(sr.get_row_ptr().begin(), sr.get_row_ptr().end(), contracted_graph_->get_raw_row_offsets()); + cudaCheckError(); + thrust::copy(sr.get_col_ind().begin(), sr.get_col_ind().end(), contracted_graph_->get_raw_column_indices()); + cudaCheckError(); + } + + void Visit(ValuedCsrGraph& graph_src) + { + size_t g_nrows = graph_src.get_num_vertices(); + size_t g_nnz = graph_src.get_num_edges(); + + VectorV vals(graph_src.get_raw_values(), graph_src.get_raw_values()+g_nnz); + + //(non-AMGX version): + //SemiRing::update_topology(contractor_.get_row_ptr(), contractor_.get_col_ind()); + typedef typename SemiRingFctrSelector::FctrType MultiplyFctr; + typedef typename SemiRingFctrSelector::FctrType SumFctr; + + MultiplyFctr mult; + SumFctr sum; + + SemiringContractionUtilities + sr(m_g_row_ptr_, + m_g_col_ind_, + contractor_.get_R_row_offsets(), + contractor_.get_R_column_indices(), + mult, + sum, + mult, + sum); + + sr.update_topology_only(); + + ///contractor_(vals, m_g_row_ptr_, m_g_col_ind_);//just drop it, no-op, here, all work done by sr and updater, below + + size_t rowptr_sz = sr.get_row_ptr().size(); + assert( rowptr_sz >= 1 ); + + size_t contrctd_nrows = rowptr_sz-1; + size_t contrctd_nnz = sr.get_subg_nnz(); + + ValuedCsrGraph* subg = new ValuedCsrGraph(contrctd_nrows, contrctd_nnz, stream_); + + //TODO: more efficient solution: investigate if/how copy can be avoided + // + thrust::copy(sr.get_row_ptr().begin(), sr.get_row_ptr().end(), subg->get_raw_row_offsets()); + cudaCheckError(); + thrust::copy(sr.get_col_ind().begin(), sr.get_col_ind().end(), subg->get_raw_column_indices()); + cudaCheckError(); + + //handling the values: + // + VertexCombineFctr v_combine; + VertexReduceFctr v_reduce; + EdgeCombineFctr e_combine; + EdgeReduceFctr e_reduce; + + //TODO: more efficient solution with VWrapper, to avoid device memory traffic + // + VectorV cg_values(subg->get_raw_values(), subg->get_raw_values()+contrctd_nnz); + + ContractionValueUpdater//useless...; only used with AMGX version + updater(vals, + cg_values, + v_combine, + v_reduce, + e_combine, + e_reduce); + + updater.update_from(contractor_.get_R_row_offsets(), + contractor_.get_R_column_indices(), + m_g_row_ptr_, + m_g_col_ind_); + + + //TODO: more efficient solution with VWrapper, to avoid device memory traffic + // + thrust::copy(cg_values.begin(), cg_values.end(), subg->get_raw_values()); + cudaCheckError(); + + + if( contracted_graph_ ) + delete contracted_graph_; + + contracted_graph_ = subg; + } + + void Visit(MultiValuedCsrGraph& graph_src) + { + 
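// Illustrative note (editorial, not part of the original nvGraph source):
// for a multi-valued graph this visit first contracts the topology (S*G*St
// with the plain Multiply/Sum semiring) and then reduces every vertex and edge
// dimension independently with the user-supplied combine/reduce functors in
// reduce_vertex_data()/reduce_edge_data() below. For example, with
// aggregates = {0,0,1}, a vertex dimension {3,1,5} and a Sum reducer, the
// contracted graph receives vertex data {4,5}, i.e. S*v applied per dimension.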
//(non-AMGX version): + //SemiRing::update_topology(contractor_.get_row_ptr(), contractor_.get_col_ind()); + typedef typename SemiRingFctrSelector::FctrType MultiplyFctr; + typedef typename SemiRingFctrSelector::FctrType SumFctr; + + MultiplyFctr mult; + SumFctr sum; + + SemiringContractionUtilities + sr(m_g_row_ptr_, + m_g_col_ind_, + contractor_.get_R_row_offsets(), + contractor_.get_R_column_indices(), + mult, + sum, + mult, + sum); + cudaCheckError(); + sr.update_topology_only(); + cudaCheckError(); + + ///contractor_(m_g_row_ptr_, m_g_col_ind_);//just drop it, no-op, here, all work done by sr and reduce_*_data(), below + + //construct the contracted graph out of contractor_ newly acquired data + size_t rowptr_sz = sr.get_row_ptr().size(); + assert( rowptr_sz >= 1 ); + + size_t contrctd_nrows = rowptr_sz-1; + size_t contrctd_nnz = sr.get_subg_nnz(); + cudaCheckError(); + + if( contracted_graph_ ) + delete contracted_graph_; + cudaCheckError(); + + MultiValuedCsrGraph* mv_cntrctd_graph = + new MultiValuedCsrGraph(contrctd_nrows, contrctd_nnz, stream_); + + cudaCheckError(); + + //TODO: more efficient solution: investigate if/how copy can be avoided + // + thrust::copy(sr.get_row_ptr().begin(), sr.get_row_ptr().end(), mv_cntrctd_graph->get_raw_row_offsets()); + cudaCheckError(); + thrust::copy(sr.get_col_ind().begin(), sr.get_col_ind().end(), mv_cntrctd_graph->get_raw_column_indices()); + cudaCheckError(); + + + //reduce vertex and edge data for the contracted graph + reduce_vertex_data(graph_src, *mv_cntrctd_graph); + reduce_edge_data(graph_src, *mv_cntrctd_graph); + + contracted_graph_ = mv_cntrctd_graph; + } + + const CFunctor& get_contractor(void) const + { + return contractor_; + } + + CsrGraph* get_contracted_graph(void) // TODO: change to unique_ptr, when moving to C++1* + { + return contracted_graph_; + } + + const VectorI& get_aggregates(void) const + { + return contractor_.get_aggregates(); + } + + protected: + //virtual reductors for contracted vertices and edges: + // + virtual void reduce_vertex_data(MultiValuedCsrGraph& graph_src, + MultiValuedCsrGraph& graph_dest) + { + SemiringContractionUtilities + sr(m_g_row_ptr_, + m_g_col_ind_, + contractor_.get_R_row_offsets(), + contractor_.get_R_column_indices(), + contractor_.get_v_combine(), + contractor_.get_v_reduce(), + contractor_.get_e_combine(), + contractor_.get_e_reduce()); + cudaCheckError(); + + if ( graph_dest.get_num_vertices() == 0 ) + FatalError("Empty contracted graph (no vertices).",NVGRAPH_ERR_BAD_PARAMETERS); + + //allocate graph_dest vertex data and fill it: + // + size_t ng = graph_src.get_num_vertex_dim(); + graph_dest.allocateVertexData(ng, stream_); + cudaCheckError(); + + for(unsigned int i=0;i& v_src = graph_src.get_vertex_dim(i); + Vector& v_dest = graph_dest.get_vertex_dim(i); + + size_t n_src = v_src.get_size(); + PtrV ptr_src(v_src.raw()); + VWrapper rv_src(ptr_src, ptr_src+n_src); + + size_t n_dest = v_dest.get_size(); + assert( graph_dest.get_num_vertices() == n_dest ); + + PtrV ptr_dest(v_dest.raw()); + VWrapper rv_dest(ptr_dest, ptr_dest+n_dest); + + sr.update_vertex_data(rv_src, rv_dest); + cudaCheckError(); + } + } + + virtual void reduce_edge_data(MultiValuedCsrGraph& graph_src, + MultiValuedCsrGraph& graph_dest) + { + SemiringContractionUtilities + sr(m_g_row_ptr_, + m_g_col_ind_, + contractor_.get_R_row_offsets(), + contractor_.get_R_column_indices(), + contractor_.get_v_combine(), + contractor_.get_v_reduce(), + contractor_.get_e_combine(), + contractor_.get_e_reduce()); + cudaCheckError(); 
+ + //There can be a contracted graph with no edges, + //but such a case warrants a warning: + // + if ( graph_dest.get_num_edges() == 0 ) + WARNING("Contracted graph is disjointed (no edges)"); + + //allocate graph_dest edge data and fill it: + // + size_t ng = graph_src.get_num_edge_dim(); + graph_dest.allocateEdgeData(ng, stream_); + cudaCheckError(); + + for(unsigned int i=0;i& v_src = graph_src.get_edge_dim(i); + Vector& v_dest = graph_dest.get_edge_dim(i); + + size_t n_src = v_src.get_size(); + PtrV ptr_src(v_src.raw()); + VWrapper rv_src(ptr_src, ptr_src+n_src); + + size_t n_dest = v_dest.get_size(); + assert( graph_dest.get_num_edges() == n_dest ); + + PtrV ptr_dest(v_dest.raw()); + VWrapper rv_dest(ptr_dest, ptr_dest+n_dest); + + sr.update_edge_data(rv_src, rv_dest); + cudaCheckError(); + } + } + + private: + VectorI m_g_row_ptr_; + VectorI m_g_col_ind_; + CFunctor contractor_; + cudaStream_t stream_; + CsrGraph* contracted_graph_; // to be constructed + }; + + + + + + //###################################################### Nested-if-then-else solution: + // + //easier on number of recursive template instantiations + //i.e., less-likely to run into compilation problems like: + //'error: excessive recursion at instantiation of function ...'; + //or the newly(as of cuda8.0) available flag: -ftemplate-depth + // + //generic empty template: + // + template + struct NestedTypedIfThenElser; + + //Level 3 (ceiling of recursion): + // + template + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i4 == n )//reached both ceiling of Level recursion and bottom of n value recursion + { + ///std::cout<<"OK: tuple("<::FctrType T4; + + typedef T1 VertexCombineFctr; + typedef T2 VertexReduceFctr; + typedef T3 EdgeCombineFctr; + typedef T4 EdgeReduceFctr; + + VertexCombineFctr v_combine; + VertexReduceFctr v_reduce; + EdgeCombineFctr e_combine; + EdgeReduceFctr e_reduce; + + GraphContractionVisitor + visitor(graph, + aggregates, + stream, + v_combine, + v_reduce, + e_combine, + e_reduce); + cudaCheckError(); + + graph.Accept(visitor); + cudaCheckError(); + return visitor.get_contracted_graph(); + } + else //continue with same level (3), but next decreasing n value + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream); + } + }; + + //Level 3 bottom: + // + template + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i4 == 0 ) + { + ///std::cout<<"OK: tuple("<::FctrType T4; + + typedef T1 VertexCombineFctr; + typedef T2 VertexReduceFctr; + typedef T3 EdgeCombineFctr; + typedef T4 EdgeReduceFctr; + + VertexCombineFctr v_combine; + VertexReduceFctr v_reduce; + EdgeCombineFctr e_combine; + EdgeReduceFctr e_reduce; + + GraphContractionVisitor + visitor(graph, + aggregates, + stream, + v_combine, + v_reduce, + e_combine, + e_reduce); + + graph.Accept(visitor); + return visitor.get_contracted_graph(); + } + else + { + std:: stringstream ss; + ss<<"ERROR: tuple("< + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, 
size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i3 == n ) + { + typedef typename SemiRingFctrSelector<(SemiRingFunctorTypes)n, ValueT>::FctrType RT;//replace T3! + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with next increasing level (3) + //with 1st possible value (N-1) + } + else + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with same level (2), but next decreasing n value + } + }; + + //Level 2 bottom: + // + template + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i3 == 0 ) + { + typedef typename SemiRingFctrSelector<(SemiRingFunctorTypes)0, ValueT>::FctrType RT;//replace T3! + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with next increasing level (3) + //with 1st possible value (N-1) + } + else + { + std:: stringstream ss; + ss<<"ERROR: tuple("< + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i2 == n ) + { + typedef typename SemiRingFctrSelector<(SemiRingFunctorTypes)n, ValueT>::FctrType RT;//replace T2! + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with next increasing level (2) + //with 1st possible value (N-1) + } + else + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with same level (1), but next decreasing n value + } + }; + + //Level 1 bottom: + // + template + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i2 == 0 ) + { + typedef typename SemiRingFctrSelector<(SemiRingFunctorTypes)0, ValueT>::FctrType RT;//replace T2! + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with next increasing level (2) + //with 1st possible value (N-1) + } + else + { + std:: stringstream ss; + ss<<"ERROR: tuple("< + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i1 == n ) + { + typedef typename SemiRingFctrSelector<(SemiRingFunctorTypes)n, ValueT>::FctrType RT;//replace T1! 
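// Illustrative note (editorial, not part of the original nvGraph source):
// each level of this nested if-then-else peels off one of the four runtime
// enum values (i1..i4) and substitutes the selected Thrust functor for the
// matching placeholder type, so that e.g. contract_from_aggregates(...,
// Multiply, Sum, Multiply, Sum) ends up instantiating a GraphContractionVisitor
// whose combine/reduce functors are multiplies/plus for both vertices and
// edges, while keeping the depth of recursive template instantiation roughly
// 4*NrFctrTypes instead of NrFctrTypes^4 for the flattened single-index
// alternative commented out in contract_from_aggregates().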
+ return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with next increasing level (1) + //with 1st possible value (N-1) + } + else + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with same level (0), but next decreasing n value + } + }; + + //Level 0 bottom: + // + template + struct NestedTypedIfThenElser + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + if( i1 == 0 ) + { + typedef typename SemiRingFctrSelector<(SemiRingFunctorTypes)0, ValueT>::FctrType RT;//replace T1! + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream);//continue with next increasing level (1) + //with 1st possible value (N-1) + } + else + { + std:: stringstream ss; + ss<<"ERROR: tuple("< + struct NestedTypedIfThenElseWrapper + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + struct Unused{};//placeholder to be replaced by actual types + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream); + } + }; + + + template + struct NestedTypedIfThenElseWrapperT + { + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + + struct Unused{};//placeholder to be replaced by actual types + + static CsrGraph* iffer(size_t i1, size_t i2, size_t i3, size_t i4, + CsrGraph& graph, + VectorI& aggregates, + cudaStream_t stream) + { + return NestedTypedIfThenElser::iffer(i1, i2, i3, i4, + graph, + aggregates, + stream); + } + }; + + + + + template + CsrGraph* contract_from_aggregates(CsrGraph& graph, + IndexT* p_aggregates, + size_t n, + cudaStream_t stream, + const SemiRingFunctorTypes& vCombine, + const SemiRingFunctorTypes& vReduce, + const SemiRingFunctorTypes& eCombine, + const SemiRingFunctorTypes& eReduce) + { + typedef thrust::device_vector VectorI; + typedef thrust::device_vector VectorV; + + VectorI aggregates(p_aggregates, p_aggregates+n); + + //Nested if-then-else solution: + // + //(no need for constness, they're NOT template args) + // + return NestedTypedIfThenElseWrapper::iffer((size_t)vCombine, + (size_t)vReduce, + (size_t)eCombine, + (size_t)eReduce, + graph, aggregates, stream); + + //Flatened if-then-else solution: + // + //const size_t M = NrFctrTypes; + //const size_t M2 = M*M; + //const size_t M3 = M2*M; + + //size_t i + // = (size_t)vCombine * M3 + // + (size_t)vReduce * M2 + // + (size_t)eCombine * M + // + (size_t)eReduce; + + //return Selector::iffer(i, graph, aggregates, stream); + } + + template + CsrGraph* contract_from_aggregates_t(CsrGraph& graph, + IndexT* p_aggregates, + size_t n, + cudaStream_t stream, + const SemiRingFunctorTypes& vCombine, + const SemiRingFunctorTypes& vReduce, + const SemiRingFunctorTypes& eCombine, + const SemiRingFunctorTypes& eReduce) + { + typedef thrust::device_vector VectorI; + typedef thrust::device_vector VectorV; + + VectorI aggregates(p_aggregates, p_aggregates+n); + + //Nested if-then-else solution: + // + //(no need for constness, they're NOT template args) + // + return NestedTypedIfThenElseWrapperT::iffer((size_t)vCombine, + (size_t)vReduce, + (size_t)eCombine, + (size_t)eReduce, + graph, aggregates, 
stream); + + //Flatened if-then-else solution: + // + //const size_t M = NrFctrTypes; + //const size_t M2 = M*M; + //const size_t M3 = M2*M; + + //size_t i + // = (size_t)vCombine * M3 + // + (size_t)vReduce * M2 + // + (size_t)eCombine * M + // + (size_t)eReduce; + + //return Selector::iffer(i, graph, aggregates, stream); + } + +} + +#endif diff --git a/cpp/nvgraph/cpp/include/graph_utils.cuh b/cpp/nvgraph/cpp/include/graph_utils.cuh new file mode 100644 index 00000000000..29350213dcf --- /dev/null +++ b/cpp/nvgraph/cpp/include/graph_utils.cuh @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Helper functions based on Thrust + + +#pragma once + +#include +#include +//#include +//#include + +#include +#include +#include +#include +#include +#include +#include + +#define USE_CG 1 +#define DEBUG 1 + +namespace nvlouvain +{ + +#define CUDA_MAX_BLOCKS 65535 +#define CUDA_MAX_KERNEL_THREADS 256 //kernel will launch at most 256 threads per block +#define DEFAULT_MASK 0xffffffff +#define US + +//#define DEBUG 1 + +//error check +#undef cudaCheckError +#ifdef DEBUG + #define WHERE " at: " << __FILE__ << ':' << __LINE__ + #define cudaCheckError() { \ + cudaError_t e=cudaGetLastError(); \ + if(e!=cudaSuccess) { \ + std::cerr << "Cuda failure: " << cudaGetErrorString(e) << WHERE << std::endl; \ + } \ + } +#else + #define cudaCheckError() + #define WHERE "" +#endif + +template +static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) +{ + #if __CUDA_ARCH__ >= 300 + #if USE_CG + return __shfl_up_sync( mask, r, offset, bound ); + #else + return __shfl_up( r, offset, bound ); + #endif + #else + return 0.0f; + #endif +} + +template +static __device__ __forceinline__ T shfl(T r, int lane, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound ); +#else + return __shfl(r, lane, bound ); +#endif + #else + return 0.0f; + #endif + } + +template +__inline__ __device__ +T parallel_prefix_sum(int n, int *ind,T *w) { + int i,j,mn; + T v,last; + T sum=0.0; + bool valid; + + //Parallel prefix sum (using __shfl) + mn =(((n+blockDim.x-1)/blockDim.x)*blockDim.x); //n in multiple of blockDim.x + for (i=threadIdx.x; i= j) sum+=v; + } + //shift by last + sum+=last; + //notice that no __threadfence or __syncthreads are needed in this implementation + } + //get the value of the last thread (to all threads) + last = shfl(sum, blockDim.x-1, blockDim.x); + + return last; +} + +//dot +template +T dot(size_t n, T* x, T* y) { + T result = thrust::inner_product(thrust::device_pointer_cast(x), + thrust::device_pointer_cast(x+n), + thrust::device_pointer_cast(y), + 0.0f); + cudaCheckError(); + return result; +} + +//axpy +template +struct axpy_functor : public thrust::binary_function { + const T a; + axpy_functor(T _a) : a(_a) {} + __host__ __device__ + T operator()(const T& x, const T& y) const { + return a * x + y; + } +}; + +template 
+void axpy(size_t n, T a, T* x, T* y) { + thrust::transform(thrust::device_pointer_cast(x), + thrust::device_pointer_cast(x+n), + thrust::device_pointer_cast(y), + thrust::device_pointer_cast(y), + axpy_functor(a)); + cudaCheckError(); +} + +//norm +template +struct square { + __host__ __device__ + T operator()(const T& x) const { + return x * x; + } +}; + +template +T nrm2(size_t n, T* x) { + T init = 0; + T result = std::sqrt( thrust::transform_reduce(thrust::device_pointer_cast(x), + thrust::device_pointer_cast(x+n), + square(), + init, + thrust::plus()) ); + cudaCheckError(); + return result; +} + +template +T nrm1(size_t n, T* x) { + T result = thrust::reduce(thrust::device_pointer_cast(x), thrust::device_pointer_cast(x+n)); + cudaCheckError(); + return result; +} + +template +void scal(size_t n, T val, T* x) { + thrust::transform(thrust::device_pointer_cast(x), + thrust::device_pointer_cast(x + n), + thrust::make_constant_iterator(val), + thrust::device_pointer_cast(x), + thrust::multiplies()); + cudaCheckError(); +} + +template +void fill(size_t n, T* x, T value) { + thrust::fill(thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), value); + cudaCheckError(); +} + +template +void printv(size_t n, T* vec, int offset) { + thrust::device_ptr dev_ptr(vec); + std::cout.precision(15); + std::cout << "sample size = "<< n << ", offset = "<< offset << std::endl; + thrust::copy(dev_ptr+offset,dev_ptr+offset+n, std::ostream_iterator(std::cout, " ")); + cudaCheckError(); + std::cout << std::endl; +} + +template +void copy(size_t n, T *x, T *res) +{ + thrust::device_ptr dev_ptr(x); + thrust::device_ptr res_ptr(res); + thrust::copy_n(dev_ptr, n, res_ptr); + cudaCheckError(); +} + +template +struct is_zero { + __host__ __device__ + bool operator()(const T x) { + return x == 0; + } +}; + +template +struct dangling_functor : public thrust::unary_function { + const T val; + dangling_functor(T _val) : val(_val) {} + __host__ __device__ + T operator()(const T& x) const { + return val + x; + } +}; + +template +void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) { + thrust::transform_if(thrust::device_pointer_cast(dangling_nodes), + thrust::device_pointer_cast( dangling_nodes + n), + thrust::device_pointer_cast(dangling_nodes), + dangling_functor(1.0-damping_factor), + is_zero()); + cudaCheckError(); +} + +//google matrix kernels +template +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +degree_coo ( const IndexType n, const IndexType e, const IndexType *ind, IndexType *degree) { + for (int i=threadIdx.x+blockIdx.x*blockDim.x; i +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +equi_prob ( const IndexType n, const IndexType e, const IndexType *ind, ValueType *val, IndexType *degree) { + for (int i=threadIdx.x+blockIdx.x*blockDim.x; i +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +flag_leafs ( const IndexType n, IndexType *degree, ValueType *bookmark) { + for (int i=threadIdx.x+blockIdx.x*blockDim.x; i +void google_matrix ( const IndexType n, const IndexType e, const IndexType *cooColInd, ValueType *cooVal, ValueType *bookmark) { + thrust::device_vector degree(n,0); + dim3 nthreads, nblocks; + nthreads.x = min(e,CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((e + nthreads.x - 1)/nthreads.x,CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + degree_coo<<>>(n,e,cooColInd, thrust::raw_pointer_cast(degree.data())); + equi_prob<<>>(n,e,cooColInd, cooVal, thrust::raw_pointer_cast(degree.data())); + 
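// Illustrative note (editorial, not part of the original nvGraph source; this
// describes the usual PageRank-style construction the kernel names suggest
// rather than a verified reading of their bodies):
// degree_coo accumulates how many times each vertex appears in cooColInd and
// equi_prob turns every nonzero into a transition probability of 1/degree,
// e.g. cooColInd = {0,0,1} gives degree = {2,1,0,...} and cooVal = {0.5,0.5,1.0};
// flag_leafs below then marks degree-0 (dangling) vertices in bookmark.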
ValueType val = 0.0; + fill(n,bookmark,val); + nthreads.x = min(n,CUDA_MAX_KERNEL_THREADS); + nblocks.x = min((n + nthreads.x - 1)/nthreads.x,CUDA_MAX_BLOCKS); + flag_leafs <<>>(n, thrust::raw_pointer_cast(degree.data()), bookmark); + //printv(n, thrust::raw_pointer_cast(degree.data()) , 0); + //printv(n, bookmark , 0); + //printv(e, cooVal , 0); +} + +template +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +update_clustering_kernel ( const IndexType n, IndexType *clustering, IndexType *aggregates_d) { + for (int i=threadIdx.x+blockIdx.x*blockDim.x; i +void update_clustering ( const IndexType n, IndexType *clustering, IndexType *aggregates_d) { + int nthreads = min(n,CUDA_MAX_KERNEL_THREADS); + int nblocks = min((n + nthreads - 1)/nthreads,CUDA_MAX_BLOCKS); + update_clustering_kernel<<>>(n,clustering,aggregates_d); +} + +} //namespace nvga diff --git a/cpp/nvgraph/cpp/include/graph_visitors.hxx b/cpp/nvgraph/cpp/include/graph_visitors.hxx new file mode 100644 index 00000000000..7c7dd1bf56b --- /dev/null +++ b/cpp/nvgraph/cpp/include/graph_visitors.hxx @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPH_VISITORS_HXX +#define GRAPH_VISITORS_HXX + +namespace nvgraph +{ + //PROBLEM: using Visitor Design Pattern over a + // hierarchy of visitees that depend on + // different number of template arguments + // + //SOLUTION:use Acyclic Visitor + // (A. Alexandrescu, "Modern C++ Design", Section 10.4), + // where *concrete* Visitors must be parameterized by all + // the possibile template args of the Visited classes (visitees); + // + struct VisitorBase + { + virtual ~VisitorBase(void) + { + } + }; + + template + struct Visitor + { + virtual void Visit(T& ) = 0; + virtual ~Visitor() { } + }; +}//end namespace +#endif + diff --git a/cpp/nvgraph/cpp/include/high_res_clock.h b/cpp/nvgraph/cpp/include/high_res_clock.h new file mode 100644 index 00000000000..3694feeb44c --- /dev/null +++ b/cpp/nvgraph/cpp/include/high_res_clock.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// A wrapper of clock_gettime. +// Michael A. 
Frumkin (mfrumkin@nvidia.com) +#pragma once + +#include +#include +#include + +class HighResClock { + public: + HighResClock() { + clock_gettime(CLOCK_REALTIME, &_start_time); + clock_gettime(CLOCK_REALTIME, &_stop_time); + } + ~HighResClock() { } + + void start() { clock_gettime(CLOCK_REALTIME, &_start_time); } + + std::string stop() { + clock_gettime(CLOCK_REALTIME, &_stop_time); + char buffer[64]; + long long int start_time = + _start_time.tv_sec * 1e9 + _start_time.tv_nsec; + long long int stop_time = + _stop_time.tv_sec * 1e9 + _stop_time.tv_nsec; + + sprintf(buffer, "%lld us", + (stop_time - start_time) / 1000); + std::string str(buffer); + return str; + } + + void stop(double* elapsed_time) { // returns time in us + clock_gettime(CLOCK_REALTIME, &_stop_time); + long long int start_time = + _start_time.tv_sec * 1e9 + _start_time.tv_nsec; + long long int stop_time = + _stop_time.tv_sec * 1e9 + _stop_time.tv_nsec; + *elapsed_time = (stop_time - start_time) / 1000; + } + + private: + timespec _start_time; + timespec _stop_time; +}; diff --git a/cpp/nvgraph/cpp/include/incidence_graph.hxx b/cpp/nvgraph/cpp/include/incidence_graph.hxx new file mode 100644 index 00000000000..02fce850c9d --- /dev/null +++ b/cpp/nvgraph/cpp/include/incidence_graph.hxx @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef incidence_graph_hxx +#define incidence_graph_hxx + +#include +#include +#include +#include +#include +#include +#include +#include + + + +#define DEBUG_ +// + +namespace nvgraph{ +namespace debug{ + +typedef std::vector > MatrixI; + +//IndexT = index type to store in the incidence Matrix +//VertexT = value type to store for each vertex +//EdgetT = value type to store for each edge +// +//Graph stored by inidence matrix +//for DEBUGGING purposes, only +//(of small graphs) +// +template +struct Graph +{ + typedef IndexT TypeI; + typedef VertexT TypeV; + typedef EdgeT TypeE; + + Graph(void): nrows_(0), ncols_(0) + { + } + + explicit Graph(const MatrixI& incidence): + nrows_(incidence.size()), + ncols_(incidence[0].size()),//throws on empty incidence! + incidence_(incidence) + { + //construct the other members? 
+ } + + virtual ~Graph(){} + + void add_vertex(const VertexT& value) + { + //add row and column: + ++nrows_; + ++ncols_; + + for(typename MatrixI::iterator row=incidence_.begin();row!=incidence_.end();++row) + { + (*row).push_back(IndexT(0)); + } + + // for(auto& row:incidence_) + // { + // row.push_back(IndexT(0)); + // } + incidence_.push_back(std::vector(ncols_,IndexT(0))); + + vertex_values_.push_back(value); + } + + void add_edge(const EdgeT& value, + const std::pair& endpoints /*first = source, second=sink*/) + { + IndexT i = endpoints.first; + IndexT j = endpoints.second; + + incidence_[i][j] = IndexT(1); + edge_values_.insert(std::make_pair(endpoints,value)); + } + + friend std::ostream& operator<<(std::ostream& os, const Graph& g) + { + g.print(os); + + return os; + } + + const MatrixI& get_incidence(void) const + { + return incidence_; + } + + MatrixI& get_incidence(void) + { + return incidence_; + } + + size_t get_nrows(void) const + { + return nrows_; + } + + size_t& get_nrows(void) + { + return nrows_; + } + + size_t get_ncols(void) const + { + return ncols_; + } + + size_t& get_ncols(void) + { + return ncols_; + } + + size_t get_nnz(void) const + { + return edge_values_.size(); + } + + const std::map, EdgeT>& get_edges(void) const + { + return edge_values_; + } + + //must be public (for CsrGraph(Graph&))...why? + std::map, EdgeT>& get_edges(void) + { + return edge_values_; + } + + std::vector& get_vertices(void) + { + return vertex_values_; + } + +protected: + struct RowPrinter + { + explicit RowPrinter(std::ostream& o): + m_os(o) + { + } + + void operator()(const std::vector& row) + { + std::copy(row.begin(), row.end(), std::ostream_iterator(m_os, ",")); + m_os<<"\n"; + } + private: + std::ostream& m_os; + }; + + void print_incidence(std::ostream& os) const + { + os<<"(nr,nc):("<& row){ + // std::copy(row.begin(), row.end(), std::ostream_iterator(os, ",")); + // os<<"\n"; + // }); + } + + void print_vertices(std::ostream& os) const + { + int i=0; + for(typename std::vector::const_iterator it=vertex_values_.begin(); + it!=vertex_values_.end(); + ++it) + { + os<<"v["<, EdgeT>::const_iterator it=edge_values_.begin(); + it!=edge_values_.end(); + ++it) + { + os<<"("<first.first<<","<first.second<<")="<second<<","; + } + + // for(auto entry:edge_values_) + // { + // os<<"("< vertex_values_; + std::map, EdgeT> edge_values_; +}; + +//CSR: +//for matrix A_{mxn} with nnz non-zero entries: +// +//vals[nnz]: contains the non-zero entries in order left-right, top-down; +// no entry for rows without non-zeros; +//row_ptr[m+1]: contains poition in "vals" of first non-zero entry for each row; +// last element is nnz; +// for empty row i, we repeat info from i+1 in row_ptr +//cols_ind[nnz]:contains column of each non-zero entry in vals; +// no entry for rows without non-zeros; +/* + col_ind[j] and vals[j] for j in [row_ptr[i], row_ptr[i+1]-1] represent the column index (unsigned integer) and value of matrix (double) on row i +*/ +// +template +struct CsrGraph: Graph +{ + using Graph::get_incidence; + using Graph::get_nrows; + using Graph::get_ncols; + using Graph::get_nnz; + using Graph::get_edges;//not confused by 2 versions of it... + using Graph::get_vertices; + + CsrGraph(void):Graph() + { + } + + explicit CsrGraph(Graph& g)://g must be non-const...why? + Graph(g.get_incidence()) + //,get_edges()(g.get_edges()) //fails to compile in initialization list...why? + { + get_edges() = g.get_edges();//ok! 
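// Worked example (editorial sketch, not part of the original nvGraph source):
// to_csr(), called just below, converts the dense incidence matrix into the
// three CSR arrays described above. For the 3x3 incidence
//
//     [ 0 1 0 ]
//     [ 0 0 1 ]
//     [ 1 0 1 ]
//
// it yields row_ptr = {0,1,2,4}, col_ind = {1,2,0,2}, and vals holding the
// matching edge values in the same order (rows scanned top-down, left-right).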
+ get_vertices() = g.get_vertices(); + + to_csr(); + } + + CsrGraph(const std::vector& vals, + const std::vector& row_ptr, + const std::vector& col_ind, + const std::vector& vertex_values): + vals_(vals), + row_ptr_(row_ptr), + col_ind_(col_ind) + { + from_csr(vertex_values); + } + + void from_csr(const std::vector& vertex_values) + { + ///size_t nnz = col_ind_.size(); + size_t nrows = vertex_values.size(); + get_nrows() = nrows; + get_ncols() = nrows; + + get_incidence().assign(nrows,std::vector(nrows,IndexT(0))); + get_vertices() = vertex_values; + + for(IndexT i=IndexT(0);i, EdgeT>& edges = get_edges(); + + vals_.assign(nnz,EdgeT()); + row_ptr_.assign(nrows+1,IndexT(0)); + row_ptr_[nrows] = IndexT(nnz); + col_ind_.assign(nnz,IndexT(0)); + + const MatrixI& A = get_incidence(); + IndexT crt_row_ptr_i(0); + IndexT crt_nz_i(0); + + std::vector all_zeros; + all_zeros.reserve(nrows); + + for(IndexT i=0;i key(i,j);//ok + //std::pair key = std::make_pair(i, j);//fails...why??? + //see: http://stackoverflow.com/questions/9641960/c11-make-pair-with-specified-template-parameters-doesnt-compile + + std::pair key = std::make_pair(i, j); + + typename std::map, EdgeT>::const_iterator pos = edges.find(key); + if (pos == edges.end()) + { + std::stringstream ss; + ss << "ERROR: edge("<second; + + + if (first_nz_inrow) + { + row_ptr_[crt_row_ptr_i] = crt_nz_i; + first_nz_inrow = false; + + ++crt_row_ptr_i; + } + col_ind_[crt_nz_i] = j; + + ++crt_nz_i; + }//end if + }//end for j + + //special cases of a row with all zeros: mark it! + if (first_nz_inrow) + { + all_zeros.push_back(i); + } + }//end for i + + //handle all zero row cases: + fix_zero_rows(all_zeros, row_ptr_); + } + + const std::vector& get_vals(void) const + { + return vals_; + } + + std::vector& get_vals(void) + { + return vals_; + } + + const std::vector& get_row_ptr(void) const + { + return row_ptr_; + } + + std::vector& get_row_ptr(void) + { + return row_ptr_; + } + + const std::vector& get_col_ind(void) const + { + return col_ind_; + } + + std::vector& get_col_ind(void) + { + return col_ind_; + } + + friend std::ostream& operator<<(std::ostream& os, const CsrGraph& g) + { + g.Graph::print(os); + g.print(os); + + return os; + } + + void extract_subgraph(std::vector& vertexSubset, + CsrGraph& subgraph) const + { + //check if vertexSubset is sorted increasingly: + // + + if( std::adjacent_find(vertexSubset.begin(), vertexSubset.end(), std::greater()) + != vertexSubset.end() )//not sorted in ascending order... 
+ { + std::sort(vertexSubset.begin(), vertexSubset.end()); + //#ifdef DEBUG_ + std::copy(vertexSubset.begin(), vertexSubset.end(), std::ostream_iterator(std::cout,",")); + std::cout<& vals_subg = subgraph.vals_; + std::vector& row_ptr_subg = subgraph.row_ptr_; + std::vector& col_ind_subg = subgraph.col_ind_; + + std::vector all_zeros; + + IndexT last_updated_pos(0); + // + size_t nrows_subg = vertexSubset.size(); + + row_ptr_subg.assign(nrows_subg+1, IndexT(0)); + all_zeros.reserve(nrows_subg); + + IndexT nz_subg(0); + + for(IndexT i=IndexT(0);i(os,",")); + os<<"\n"; + + os<<"row_ptr: "; + std::copy(row_ptr_.begin(), row_ptr_.end(), std::ostream_iterator(os,",")); + os<<"\n"; + + os<<"col_ind: "; + std::copy(col_ind_.begin(), col_ind_.end(), std::ostream_iterator(os,",")); + os<<"\n"; + } + + struct Updater + { + explicit Updater(std::vector& row_ptr): + m_row_ptr(row_ptr) + { + } + + void operator()(const IndexT& i) + { + m_row_ptr[i] = m_row_ptr[i+1]; + } + private: + std::vector& m_row_ptr; + }; + + //correct row_ptr: iterate all_zeros from end towards beginning + //and correct row_ptr_ at corresponding index + // + static void fix_zero_rows(const std::vector& all_zeros, + std::vector& row_ptr) + { + Updater up(row_ptr); + std::for_each(all_zeros.rbegin(), all_zeros.rend(), up); + + // std::for_each(all_zeros.rbegin(), all_zeros.rend(), [&](const IndexT& i){ + // row_ptr[i] = row_ptr[i+1]; + // }); + } + + struct HashUpdater + { + explicit HashUpdater(std::vector& hash): + m_hash(hash), + m_counter(0) + { + } + + void operator()(const IndexT& i) + { + m_hash[i]=m_counter++; + } + private: + std::vector& m_hash; + IndexT m_counter; + }; + + //assumes src is ordered increasingly + // + static void remap_indices(const std::vector& src, + std::vector& index_set) + { + IndexT max_entry = src.back(); + + //use hash_src vector as hash-table: + // + std::vector hash_src(max_entry+1, IndexT(0)); + ///std::iota(hash_src.begin(), hash_src.end(), IndexT(0));//increasing sequence + + HashUpdater hasher(hash_src); + std::for_each(src.begin(), src.end(), hasher); + + // IndexT counter(0); + // std::for_each(src.begin(), src.end(), [&](const IndexT& i){ + // hash_src[i]=counter++; + // }); + + size_t set_sz = index_set.size(); + std::vector old_index_set(index_set); + + for(IndexT k = 0;k vals_; + std::vector row_ptr_; + std::vector col_ind_; +}; + +}//end namespace debug +}//end namespace nvgraph + +#endif /* incidence_graph_hxx */ diff --git a/cpp/nvgraph/cpp/include/jaccard_gpu.cuh b/cpp/nvgraph/cpp/include/jaccard_gpu.cuh new file mode 100644 index 00000000000..84b16c7c903 --- /dev/null +++ b/cpp/nvgraph/cpp/include/jaccard_gpu.cuh @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Jaccard symilarity edge weights +// Author: Alexandre Fender afender@nvidia.com and Maxim Naumov. 
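// Worked illustration (editorial sketch, not part of the original nvGraph
// source): the Jaccard weight of an edge (i,j) is |N(i) ∩ N(j)| / |N(i) ∪ N(j)|,
// the overlap of the endpoints' neighbor sets. A plain host-side version over
// two sorted neighbor lists could look like this:
//
//   #include <algorithm>
//   #include <iterator>
//   #include <vector>
//
//   inline double jaccard_weight(const std::vector<int>& ni, const std::vector<int>& nj)
//   {
//     std::vector<int> common;
//     std::set_intersection(ni.begin(), ni.end(), nj.begin(), nj.end(),
//                           std::back_inserter(common));
//     double isect = static_cast<double>(common.size());
//     double uni   = ni.size() + nj.size() - isect; // |A u B| = |A| + |B| - |A n B|
//     return uni > 0.0 ? isect / uni : 0.0;
//   }
//
//   // e.g. N(i) = {2,3,4}, N(j) = {3,4,5}  ->  2 / 4 = 0.5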
+ +#pragma once + +namespace nvlouvain +{ +template +int jaccard(int n, int e, int *csrPtr, int *csrInd, T * csrVal, T *v, T *work, T gamma, T *weight_i, T *weight_s, T *weight_j); +} diff --git a/cpp/nvgraph/cpp/include/kmeans.hxx b/cpp/nvgraph/cpp/include/kmeans.hxx new file mode 100644 index 00000000000..386b084706a --- /dev/null +++ b/cpp/nvgraph/cpp/include/kmeans.hxx @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "nvgraph_error.hxx" + +namespace nvgraph { + + /// Find clusters with k-means algorithm + /** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * CNMEM must be initialized before calling this function. + * + * @param cublasHandle_t cuBLAS handle. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param residual On exit, residual sum of squares (sum of squares + * of distances between observation vectors and centroids). + * @param On exit, number of k-means iterations. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR kmeans(IndexType_ n, IndexType_ d, IndexType_ k, + ValueType_ tol, IndexType_ maxiter, + const ValueType_ * __restrict__ obs, + IndexType_ * __restrict__ codes, + ValueType_ & residual, + IndexType_ & iters); + + /// Find clusters with k-means algorithm + /** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*max(k,d) entries) + * Workspace. 
+ * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares (sum of squares of distances between observation + * vectors and centroids). + * @param iters_host (Output, host memory, 1 entry) Number of + * k-means iterations. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR kmeans(IndexType_ n, IndexType_ d, IndexType_ k, + ValueType_ tol, IndexType_ maxiter, + const ValueType_ * __restrict__ obs, + IndexType_ * __restrict__ codes, + IndexType_ * __restrict__ clusterSizes, + ValueType_ * __restrict__ centroids, + ValueType_ * __restrict__ work, + IndexType_ * __restrict__ work_int, + ValueType_ * residual_host, + IndexType_ * iters_host); + +} + diff --git a/cpp/nvgraph/cpp/include/lanczos.hxx b/cpp/nvgraph/cpp/include/lanczos.hxx new file mode 100644 index 00000000000..9875e1b4f12 --- /dev/null +++ b/cpp/nvgraph/cpp/include/lanczos.hxx @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once + +#include "nvgraph_error.hxx" +#include "matrix.hxx" + +namespace nvgraph { + + /// Compute smallest eigenvectors of symmetric matrix + /** Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * + * CNMEM must be initialized before calling this function. + * + * @param A Pointer to matrix object. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @return NVGRAPH error flag. 
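A minimal sketch of driving the first `kmeans()` overload declared above, assuming `<IndexType_, ValueType_> = <int, float>` and that the CNMEM pool has already been initialized as the comment requires; the helper name and buffer handling are assumptions of this sketch:

```cpp
#include <cuda_runtime.h>
#include <vector>
#include "kmeans.hxx"

// codes_d: device memory, n entries, filled with cluster assignments on exit.
nvgraph::NVGRAPH_ERROR run_kmeans(const std::vector<float>& host_obs,
                                  int n, int d, int k, int* codes_d)
{
    // Observation matrix: d x n, column-major, device memory (see the doc above).
    float* obs_d = nullptr;
    cudaMalloc(&obs_d, sizeof(float) * d * n);
    cudaMemcpy(obs_d, host_obs.data(), sizeof(float) * d * n, cudaMemcpyHostToDevice);

    float residual = 0.0f;  // residual sum of squares on exit
    int   iters    = 0;     // number of k-means iterations performed
    nvgraph::NVGRAPH_ERROR err =
        nvgraph::kmeans(n, d, k, 1e-4f /*tol*/, 100 /*maxiter*/,
                        obs_d, codes_d, residual, iters);

    cudaFree(obs_d);
    return err;
}
```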
+ */ + template + NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix & A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ & iter, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev); + + /// Compute smallest eigenvectors of symmetric matrix + /** Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * + * @param A Pointer to matrix object. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ * iter, + IndexType_ * totalIter, + ValueType_ * shift, + ValueType_ * __restrict__ alpha_host, + ValueType_ * __restrict__ beta_host, + ValueType_ * __restrict__ lanczosVecs_dev, + ValueType_ * __restrict__ work_dev, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev); + + /// Compute largest eigenvectors of symmetric matrix + /** Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. 
+ * @param maxIter Maximum number of Lanczos steps. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR computeLargestEigenvectors(const Matrix * A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ * effIter, + IndexType_ * totalIter, + ValueType_ * __restrict__ alpha_host, + ValueType_ * __restrict__ beta_host, + ValueType_ * __restrict__ lanczosVecs_dev, + ValueType_ * __restrict__ work_dev, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev); + + /// Compute largest eigenvectors of symmetric matrix + /** Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * + * CNMEM must be initialized before calling this function. + * + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. 
Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR computeLargestEigenvectors(const Matrix & A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ & iter, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev); + +} + diff --git a/cpp/nvgraph/cpp/include/lobpcg.hxx b/cpp/nvgraph/cpp/include/lobpcg.hxx new file mode 100755 index 00000000000..b8695802d40 --- /dev/null +++ b/cpp/nvgraph/cpp/include/lobpcg.hxx @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "matrix.hxx" +#include "partition.hxx" + +namespace nvgraph { + + template + int lobpcg_simplified(cublasHandle_t cublasHandle, cusolverDnHandle_t cusolverHandle, + IndexType_ n, IndexType_ k, + /*const*/ Matrix * A, + ValueType_ * __restrict__ eigVecs_dev, + ValueType_ * __restrict__ eigVals_dev, + IndexType_ maxIter,ValueType_ tol, + ValueType_ * __restrict__ work_dev, + IndexType_ & iter); + +} diff --git a/cpp/nvgraph/cpp/include/matrix.hxx b/cpp/nvgraph/cpp/include/matrix.hxx new file mode 100644 index 00000000000..446f20144e7 --- /dev/null +++ b/cpp/nvgraph/cpp/include/matrix.hxx @@ -0,0 +1,789 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include "nvgraph_vector.hxx" +#include "valued_csr_graph.hxx" + +namespace nvgraph { + + /// Abstract matrix class + /** Derived classes must implement matrix-vector products. + */ + template + class Matrix { + public: + /// Number of rows + const IndexType_ m; + /// Number of columns + const IndexType_ n; + /// CUDA stream + cudaStream_t s; + + /// Constructor + /** @param _m Number of rows. + * @param _n Number of columns. + */ + Matrix(IndexType_ _m, IndexType_ _n) : m(_m), n(_n), s(0){} + + /// Destructor + virtual ~Matrix() {} + + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s) = 0; + virtual void getCUDAStream(cudaStream_t *_s) = 0; + + /// Matrix-vector product + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output + * vector. 
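A sketch of calling the simple `computeSmallestEigenvectors()` overload above, assuming `<IndexType_, ValueType_> = <int, float>` and some concrete `nvgraph::Matrix` implementation (for instance the `LaplacianMatrix` declared further down in matrix.hxx); the parameter choices and buffer ownership are assumptions of this sketch:

```cpp
#include "lanczos.hxx"
#include "matrix.hxx"

// eigVals_dev: device, nEigVecs entries; eigVecs_dev: device, n * nEigVecs, column-major.
nvgraph::NVGRAPH_ERROR smallest_eigs(const nvgraph::Matrix<int, float>& A,
                                     int nEigVecs,
                                     float* eigVals_dev,
                                     float* eigVecs_dev)
{
    int iters = 0;
    return nvgraph::computeSmallestEigenvectors(A,
                                                nEigVecs,
                                                /*maxIter=*/4000,
                                                /*restartIter=*/nEigVecs + 32,  // must be >= 4
                                                /*tol=*/1e-3f,
                                                /*reorthogonalize=*/false,
                                                iters,
                                                eigVals_dev,
                                                eigVecs_dev);
}
```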
+ */ + virtual void mv(ValueType_ alpha, + const ValueType_ * __restrict__ x, + ValueType_ beta, + ValueType_ * __restrict__ y) const = 0; + + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const = 0; + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const = 0; + virtual void reorder(IndexType_ *p) const = 0; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M) = 0; + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const = 0; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const = 0; + }; + + /// Dense matrix class + template + class DenseMatrix : public Matrix { + + private: + /// Whether to transpose matrix + const bool trans; + /// Matrix entries, stored column-major in device memory + const ValueType_ * A; + /// Leading dimension of matrix entry array + const IndexType_ lda; + + public: + /// Constructor + DenseMatrix(bool _trans, + IndexType_ _m, IndexType_ _n, + const ValueType_ * _A, IndexType_ _lda); + + /// Destructor + virtual ~DenseMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Sparse matrix class in CSR format + template + class CsrMatrix : public Matrix { + + private: + /// Whether to transpose matrix + const bool trans; + /// Whether matrix is stored in symmetric format + const bool sym; + /// Number of non-zero entries + const IndexType_ nnz; + /// Matrix properties + const cusparseMatDescr_t descrA; + /// Matrix entry values (device memory) + /*const*/ ValueType_ * csrValA; + /// Pointer to first entry in each row (device memory) + const IndexType_ * csrRowPtrA; + /// Column index of each matrix entry (device memory) + const IndexType_ * csrColIndA; + /// Analysis info (pointer to opaque CUSPARSE struct) + cusparseSolveAnalysisInfo_t info_l; + cusparseSolveAnalysisInfo_t info_u; + /// factored flag (originally set to false, then reset to true after factorization), + /// notice we only want to factor once + bool factored; + + public: + /// Constructor + CsrMatrix(bool _trans, bool _sym, + IndexType_ _m, IndexType_ _n, IndexType_ _nnz, + const cusparseMatDescr_t _descrA, + /*const*/ ValueType_ * _csrValA, + const IndexType_ * _csrRowPtrA, + const IndexType_ * _csrColIndA); + + /// Constructor + CsrMatrix( ValuedCsrGraph & G, const cusparseMatDescr_t _descrA =0); + + /// Destructor + virtual ~CsrMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ 
* __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Graph Laplacian matrix + template + class LaplacianMatrix + : public Matrix { + + private: + /// Adjacency matrix + /*const*/ Matrix * A; + /// Degree of each vertex + Vector D; + /// Preconditioning matrix + Matrix * M; + + public: + /// Constructor + LaplacianMatrix(/*const*/ Matrix & _A); + + /// Destructor + virtual ~LaplacianMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Scale a set of k vectors by a diagonal + virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Solve preconditioned system M x = f for a set of k vectors + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Modularity matrix + template + class ModularityMatrix + : public Matrix { + + private: + /// Adjacency matrix + /*const*/ Matrix * A; + /// Degree of each vertex + Vector D; + IndexType_ nnz; + ValueType_ edge_sum; + + /// Preconditioning matrix + Matrix * M; + + public: + /// Constructor + ModularityMatrix(/*const*/ Matrix & _A, IndexType_ _nnz); + + /// Destructor + virtual ~ModularityMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Scale a set of k vectors by a diagonal + virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Solve preconditioned system M x = f for a set of k vectors + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + 
virtual ValueType_ getEdgeSum() const; + }; + +// cublasIxamax +inline +cublasStatus_t cublasIxamax(cublasHandle_t handle, int n, + const float *x, int incx, int *result) { + return cublasIsamax(handle, n, x, incx, result); +} +inline +cublasStatus_t cublasIxamax(cublasHandle_t handle, int n, + const double *x, int incx, int *result) { + return cublasIdamax(handle, n, x, incx, result); +} + +// cublasIxamin +inline +cublasStatus_t cublasIxamin(cublasHandle_t handle, int n, + const float *x, int incx, int *result) { + return cublasIsamin(handle, n, x, incx, result); +} +inline +cublasStatus_t cublasIxamin(cublasHandle_t handle, int n, + const double *x, int incx, int *result) { + return cublasIdamin(handle, n, x, incx, result); +} + +// cublasXasum +inline +cublasStatus_t cublasXasum(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + return cublasSasum(handle, n, x, incx, result); +} +inline +cublasStatus_t cublasXasum(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + return cublasDasum(handle, n, x, incx, result); +} + +// cublasXaxpy +inline +cublasStatus_t cublasXaxpy(cublasHandle_t handle, int n, + const float * alpha, + const float * x, int incx, + float * y, int incy) { + return cublasSaxpy(handle, n, alpha, x, incx, y, incy); +} +inline +cublasStatus_t cublasXaxpy(cublasHandle_t handle, int n, + const double *alpha, + const double *x, int incx, + double *y, int incy) { + return cublasDaxpy(handle, n, alpha, x, incx, y, incy); +} + +// cublasXcopy +inline +cublasStatus_t cublasXcopy(cublasHandle_t handle, int n, + const float *x, int incx, + float *y, int incy) { + return cublasScopy(handle, n, x, incx, y, incy); +} +inline +cublasStatus_t cublasXcopy(cublasHandle_t handle, int n, + const double *x, int incx, + double *y, int incy) { + return cublasDcopy(handle, n, x, incx, y, incy); +} + +// cublasXdot +inline +cublasStatus_t cublasXdot(cublasHandle_t handle, int n, + const float *x, int incx, + const float *y, int incy, + float *result) { + return cublasSdot(handle, n, x, incx, y, incy, result); +} +inline +cublasStatus_t cublasXdot(cublasHandle_t handle, int n, + const double *x, int incx, + const double *y, int incy, + double *result) { + return cublasDdot(handle, n, x, incx, y, incy, result); +} + +// cublasXnrm2 +inline +cublasStatus_t cublasXnrm2(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + return cublasSnrm2(handle, n, x, incx, result); +} +inline +cublasStatus_t cublasXnrm2(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + return cublasDnrm2(handle, n, x, incx, result); +} + +// cublasXscal +inline +cublasStatus_t cublasXscal(cublasHandle_t handle, int n, + const float *alpha, + float *x, int incx) { + return cublasSscal(handle, n, alpha, x, incx); +} +inline +cublasStatus_t cublasXscal(cublasHandle_t handle, int n, + const double *alpha, + double *x, int incx) { + return cublasDscal(handle, n, alpha, x, incx); +} + +// cublasXgemv +inline +cublasStatus_t cublasXgemv(cublasHandle_t handle, + cublasOperation_t trans, + int m, int n, + const float *alpha, + const float *A, int lda, + const float *x, int incx, + const float *beta, + float *y, int incy) { + return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, + beta, y, incy); +} +inline +cublasStatus_t cublasXgemv(cublasHandle_t handle, + cublasOperation_t trans, + int m, int n, + const double *alpha, + const double *A, int lda, + const double *x, int incx, + const double *beta, + double *y, int 
incy) { + return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, + beta, y, incy); +} + +// cublasXger +inline +cublasStatus_t cublasXger(cublasHandle_t handle, int m, int n, + const float *alpha, + const float *x, int incx, + const float *y, int incy, + float *A, int lda) { + return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); +} +inline +cublasStatus_t cublasXger(cublasHandle_t handle, int m, int n, + const double *alpha, + const double *x, int incx, + const double *y, int incy, + double *A, int lda) { + return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +// cublasXgemm +inline +cublasStatus_t cublasXgemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *A, int lda, + const float *B, int ldb, + const float *beta, + float *C, int ldc) { + return cublasSgemm(handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); +} +inline +cublasStatus_t cublasXgemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const double *alpha, + const double *A, int lda, + const double *B, int ldb, + const double *beta, + double *C, int ldc) { + return cublasDgemm(handle, transa, transb, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); +} + +// cublasXgeam +inline +cublasStatus_t cublasXgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, + const float *alpha, + const float *A, int lda, + const float *beta, + const float *B, int ldb, + float *C, int ldc) { + return cublasSgeam(handle, transa, transb, m, n, + alpha, A, lda, beta, B, ldb, C, ldc); +} +inline +cublasStatus_t cublasXgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, + const double *alpha, + const double *A, int lda, + const double *beta, + const double *B, int ldb, + double *C, int ldc) { + return cublasDgeam(handle, transa, transb, m, n, + alpha, A, lda, beta, B, ldb, C, ldc); +} + +// cublasXtrsm +inline cublasStatus_t cublasXtrsm(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float *alpha, const float *A, int lda, float *B, int ldb) { + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} +inline cublasStatus_t cublasXtrsm(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double *alpha, const double *A, int lda, double *B, int ldb) { + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +// curandGeneratorNormalX +inline +curandStatus_t +curandGenerateNormalX(curandGenerator_t generator, + float * outputPtr, size_t n, + float mean, float stddev) { + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline +curandStatus_t +curandGenerateNormalX(curandGenerator_t generator, + double * outputPtr, size_t n, + double mean, double stddev) { + return curandGenerateNormalDouble(generator, outputPtr, + n, mean, stddev); +} + +// cusolverXpotrf_bufferSize +inline cusolverStatus_t cusolverXpotrf_bufferSize(cusolverDnHandle_t handle, int n, float *A, int lda, int *Lwork){ + return cusolverDnSpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Lwork); +} +inline cusolverStatus_t cusolverXpotrf_bufferSize(cusolverDnHandle_t handle, int n, double *A, int lda, int *Lwork){ + return 
cusolverDnDpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Lwork); +} + +// cusolverXpotrf +inline cusolverStatus_t cusolverXpotrf(cusolverDnHandle_t handle, int n, float *A, int lda, float *Workspace, int Lwork, int *devInfo){ + return cusolverDnSpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Workspace,Lwork,devInfo); +} +inline cusolverStatus_t cusolverXpotrf(cusolverDnHandle_t handle, int n, double *A, int lda, double *Workspace, int Lwork, int *devInfo){ + return cusolverDnDpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Workspace,Lwork,devInfo); +} + +// cusolverXgesvd_bufferSize +inline cusolverStatus_t cusolverXgesvd_bufferSize(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *U, int ldu, float *VT, int ldvt, int *Lwork){ + //ideally + //char jobu = 'O'; + //char jobvt= 'N'; + //only supported + //char jobu = 'A'; + //char jobvt= 'A'; + return cusolverDnSgesvd_bufferSize(handle,m,n,Lwork); +} + +inline cusolverStatus_t cusolverXgesvd_bufferSize(cusolverDnHandle_t handle, int m, int n, double *A, int lda, double *U, int ldu, double *VT, int ldvt, int *Lwork){ + //ideally + //char jobu = 'O'; + //char jobvt= 'N'; + //only supported + //char jobu = 'A'; + //char jobvt= 'A'; + return cusolverDnDgesvd_bufferSize(handle,m,n,Lwork); +} + +// cusolverXgesvd +inline cusolverStatus_t cusolverXgesvd(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, float *Work, int Lwork, float *rwork, int *devInfo){ + //ideally + //char jobu = 'O'; + //char jobvt= 'N'; + //only supported + char jobu = 'A'; + char jobvt= 'A'; + + return cusolverDnSgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); +} + +inline cusolverStatus_t cusolverXgesvd(cusolverDnHandle_t handle, int m, int n, double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, double *Work, int Lwork, double *rwork, int *devInfo){ + //ideally + //char jobu = 'O'; + //char jobvt= 'N'; + //only supported + char jobu = 'A'; + char jobvt= 'A'; + return cusolverDnDgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); +} + +// cusolverXgesvd_cond +inline cusolverStatus_t cusolverXgesvd_cond(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, float *Work, int Lwork, float *rwork, int *devInfo){ + //ideally + //char jobu = 'N'; + //char jobvt= 'N'; + //only supported + char jobu = 'A'; + char jobvt= 'A'; + return cusolverDnSgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); +} + +inline cusolverStatus_t cusolverXgesvd_cond(cusolverDnHandle_t handle, int m, int n, double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, double *Work, int Lwork, double *rwork, int *devInfo){ + //ideally + //char jobu = 'N'; + //char jobvt= 'N'; + //only supported + char jobu = 'A'; + char jobvt= 'A'; + return cusolverDnDgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); +} + +// cusparseXcsrmv +inline +cusparseStatus_t cusparseXcsrmv(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, int n, int nnz, + const float * alpha, + const cusparseMatDescr_t descrA, + const float * csrValA, + const int * csrRowPtrA, + const int * csrColIndA, + const float * x, + const float * beta, + float *y) { + return cusparseScsrmv_mp(handle, transA, m, n, nnz, + alpha, descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y); +} +inline +cusparseStatus_t cusparseXcsrmv(cusparseHandle_t handle, + cusparseOperation_t transA, 
+ int m, int n, int nnz, + const double * alpha, + const cusparseMatDescr_t descrA, + const double * csrValA, + const int * csrRowPtrA, + const int * csrColIndA, + const double * x, + const double * beta, + double *y) { + return cusparseDcsrmv_mp(handle, transA, m, n, nnz, + alpha, descrA, csrValA, csrRowPtrA, csrColIndA, + x, beta, y); +} + +// cusparseXcsrmm +inline +cusparseStatus_t cusparseXcsrmm(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, int n, int k, int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *B, int ldb, + const float *beta, + float *C, int ldc) { + return cusparseScsrmm(handle, transA, m, n, k, nnz, + alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, + B, ldb, beta, C, ldc); +} +inline +cusparseStatus_t cusparseXcsrmm(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, int n, int k, int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *B, int ldb, + const double *beta, + double *C, int ldc) { + return cusparseDcsrmm(handle, transA, m, n, k, nnz, + alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, + B, ldb, beta, C, ldc); +} + +// cusparseXcsrgeam +inline +cusparseStatus_t cusparseXcsrgeam(cusparseHandle_t handle, + int m, int n, + const float *alpha, + const cusparseMatDescr_t descrA, + int nnzA, const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *beta, + const cusparseMatDescr_t descrB, + int nnzB, const float *csrValB, + const int *csrRowPtrB, + const int *csrColIndB, + const cusparseMatDescr_t descrC, + float *csrValC, + int *csrRowPtrC, int *csrColIndC) { + return cusparseScsrgeam(handle,m,n, + alpha,descrA,nnzA,csrValA,csrRowPtrA,csrColIndA, + beta,descrB,nnzB,csrValB,csrRowPtrB,csrColIndB, + descrC,csrValC,csrRowPtrC,csrColIndC); +} +inline +cusparseStatus_t cusparseXcsrgeam(cusparseHandle_t handle, + int m, int n, + const double *alpha, + const cusparseMatDescr_t descrA, + int nnzA, const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *beta, + const cusparseMatDescr_t descrB, + int nnzB, const double *csrValB, + const int *csrRowPtrB, + const int *csrColIndB, + const cusparseMatDescr_t descrC, + double *csrValC, + int *csrRowPtrC, int *csrColIndC) { + return cusparseDcsrgeam(handle,m,n, + alpha,descrA,nnzA,csrValA,csrRowPtrA,csrColIndA, + beta,descrB,nnzB,csrValB,csrRowPtrB,csrColIndB, + descrC,csrValC,csrRowPtrC,csrColIndC); +} + +//ILU0, incomplete-LU with 0 threshhold (CUSPARSE) +inline cusparseStatus_t cusparseXcsrilu0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + float *csrValM, + const int *csrRowPtrA, + const int *csrColIndA, + cusparseSolveAnalysisInfo_t info){ + return cusparseScsrilu0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info); +} + +inline cusparseStatus_t cusparseXcsrilu0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + double *csrValM, + const int *csrRowPtrA, + const int *csrColIndA, + cusparseSolveAnalysisInfo_t info){ + return cusparseDcsrilu0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info); +} + +//IC0, incomplete-Cholesky with 0 threshhold (CUSPARSE) +inline cusparseStatus_t cusparseXcsric0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + float *csrValM, + const int 
*csrRowPtrA, + const int *csrColIndA, + cusparseSolveAnalysisInfo_t info){ + return cusparseScsric0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info); +} +inline cusparseStatus_t cusparseXcsric0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + double *csrValM, + const int *csrRowPtrA, + const int *csrColIndA, + cusparseSolveAnalysisInfo_t info){ + return cusparseDcsric0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info); +} + +//sparse triangular solve (CUSPARSE) +//analysis phase +inline cusparseStatus_t cusparseXcsrsm_analysis (cusparseHandle_t handle, cusparseOperation_t transa, int m, int nnz, const cusparseMatDescr_t descra, + const float *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info){ + return cusparseScsrsm_analysis(handle,transa,m,nnz,descra,a,ia,ja,info); +} +inline cusparseStatus_t cusparseXcsrsm_analysis (cusparseHandle_t handle, cusparseOperation_t transa, int m, int nnz, const cusparseMatDescr_t descra, + const double *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info){ + return cusparseDcsrsm_analysis(handle,transa,m,nnz,descra,a,ia,ja,info); +} +//solve phase +inline cusparseStatus_t cusparseXcsrsm_solve (cusparseHandle_t handle, cusparseOperation_t transa, int m, int k, float alpha, const cusparseMatDescr_t descra, + const float *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info, const float *x, int ldx, float *y, int ldy){ + return cusparseScsrsm_solve(handle,transa,m,k,&alpha,descra,a,ia,ja,info,x,ldx,y,ldy); +} +inline cusparseStatus_t cusparseXcsrsm_solve (cusparseHandle_t handle, cusparseOperation_t transa, int m, int k, double alpha, const cusparseMatDescr_t descra, + const double *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info, const double *x, int ldx, double *y, int ldy){ + return cusparseDcsrsm_solve(handle,transa,m,k,&alpha,descra,a,ia,ja,info,x,ldx,y,ldy); +} + + +inline cusparseStatus_t cusparseXcsrcolor(cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, const int *csrColIndA, const float *fractionToColor, int *ncolors, int *coloring, int *reordering,cusparseColorInfo_t info) { + return cusparseScsrcolor(handle,m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,fractionToColor,ncolors,coloring,reordering,info); +} +inline cusparseStatus_t cusparseXcsrcolor(cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, const int *csrColIndA, const double *fractionToColor, int *ncolors, int *coloring, int *reordering,cusparseColorInfo_t info) { + return cusparseDcsrcolor(handle,m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,fractionToColor,ncolors,coloring,reordering,info); +} + + +} + diff --git a/cpp/nvgraph/cpp/include/modularity.cuh b/cpp/nvgraph/cpp/include/modularity.cuh new file mode 100644 index 00000000000..49917ce30d7 --- /dev/null +++ b/cpp/nvgraph/cpp/include/modularity.cuh @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
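The `cublasX*` / `cusolverX*` / `cusparseX*` overload sets collected above let precision-templated code resolve to the correct single- or double-precision entry point through ordinary overload resolution. A small illustrative use; the function name and handle management are assumptions of this sketch:

```cpp
#include <cublas_v2.h>
#include "matrix.hxx"

// y += alpha * x, then return ||y||_2. Instantiated with float this calls
// cublasSaxpy/cublasSnrm2; with double it calls cublasDaxpy/cublasDnrm2.
template <typename ValueType_>
ValueType_ scaled_update_norm(cublasHandle_t handle, int n, ValueType_ alpha,
                              const ValueType_* x_dev, ValueType_* y_dev)
{
    nvgraph::cublasXaxpy(handle, n, &alpha, x_dev, 1, y_dev, 1);
    ValueType_ result{};
    nvgraph::cublasXnrm2(handle, n, y_dev, 1, &result);
    return result;
}
```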
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include "util.cuh" +#include "graph_utils.cuh" +#include "functor.cuh" +//#include "block_modulariy.cuh" + + +namespace nvlouvain{ +/************************************************************* +* +* compute k vector from [ k0, k1, ..., kn ] +* +* - input : +* n_vertex +* csr_ptr's iterator +* csr_val's iterator +* +* - output: +* results: k_vec : k vectors +* +***************************************************************/ +template +__device__ void compute_k_vec(const int n_vertex, IdxType* csr_ptr_ptr, ValType* csr_val_ptr, bool weighted, ValType* k_vec){ + + int tid = blockDim.x*blockIdx.x + threadIdx.x; + + if( (tid < n_vertex) ){ + + int start_idx = *(csr_ptr_ptr + tid); + int end_idx = *(csr_ptr_ptr + tid + 1); + +#ifdef DEBUG + if( end_idx > (*(csr_ptr_ptr + n_vertex)) ){ + printf("Error computing ki iter but end_idx >= n_vertex %d >= %d\n"); + *(k_vec + tid) = 0.0; + } +#endif + + if(!weighted){ + *(k_vec + tid) = (ValType)end_idx - start_idx; + } + else{ + ValType sum = 0.0; +#pragma unroll + for(int i = 0 ; i < end_idx - start_idx; ++ i){ + sum += *(csr_val_ptr + start_idx + i); + } + *(k_vec + tid) = sum; + } + } + return; +} + +template +__device__ void +modularity_i( const int n_vertex, + const int n_clusters, + IdxType* csr_ptr_ptr, + IdxType* csr_ind_ptr, + ValType* csr_val_ptr, + IdxType* cluster_ptr, + IdxType* cluster_inv_ptr_ptr, + IdxType* cluster_inv_ind_ptr, + ValType* k_ptr, + ValType* Q_arr, + ValType* temp_i, // size = n_edges + ValType m2 + ){ + + int i = blockIdx.x * blockDim.x + threadIdx.x; + IdxType start_idx, end_idx, c_i; + ValType ki(0.0), Ai(0.0), sum_k(0.0); + IdxType start_c_idx; + IdxType end_c_idx; + + if(i < n_vertex){ + start_idx = *( csr_ptr_ptr + i ); + end_idx = *( csr_ptr_ptr + i + 1 ); + + c_i = *(cluster_ptr + i); + ki = *(k_ptr + i); + + //only sees its neibors + Ai = 0.0; +#pragma unroll + for(int j = 0; j< end_idx - start_idx; ++j){ + IdxType j_idx = (IdxType)(*(csr_ind_ptr + j + start_idx)); + IdxType c_j = (IdxType)(*(cluster_ptr + j_idx)); + Ai += ((int)(c_i != c_j)*((ValType)(*(csr_val_ptr + j + start_idx)))); + } + + + start_c_idx = *(cluster_inv_ptr_ptr + c_i); + end_c_idx = *(cluster_inv_ptr_ptr + c_i + 1); + + +#ifdef DEBUG + if (temp_i == NULL) printf("Error in allocate temp_i memory in thread %d\n",i); +#endif + +#pragma unroll + for(int j = 0; j< end_c_idx-start_c_idx; ++j){ + IdxType j_idx = (IdxType)(*(cluster_inv_ind_ptr + j + start_c_idx)); + sum_k += (ValType)(*(k_ptr + j_idx)); + } + + sum_k = m2 - sum_k; + *(Q_arr + i) =( Ai - (( ki * sum_k )/ m2))/m2 ; +// printf("-- i: %d Q: %.6e Ai: %f ki*sum_k = %f x %f = %f\n", i, *(Q_arr + i), Ai, ki, sum_k, (ki * sum_k)); + + } + return; +} + + + +template +__device__ void +modularity_no_matrix(const int n_vertex, const int n_clusters, ValType m2, + IdxType* csr_ptr_ptr, IdxType* csr_ind_ptr, ValType* csr_val_ptr, + IdxType* cluster_ptr, IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, + bool weighted, // bool identical_cluster, // todo optimizaiton + ValType* k_vec, + ValType* Q_arr, + ValType* temp_i){ + + + compute_k_vec(n_vertex, csr_ptr_ptr, csr_val_ptr, weighted, k_vec); + __syncthreads(); + + modularity_i(n_vertex, n_clusters, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + k_vec, Q_arr, temp_i, m2); + +} + + + 
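For reference, `modularity_i()` above appears to assemble the standard Newman modularity vertex by vertex. With $2m$ = `m2` the total (doubled) edge weight, $k_i$ the weighted degrees and $K_c = \sum_{v \in c} k_v$, each thread computes

$$
q_i = \frac{1}{2m}\left(\sum_{j:\,c_j \neq c_i} A_{ij} \;-\; \frac{k_i\,(2m - K_{c_i})}{2m}\right),
\qquad
\sum_i q_i = -\,Q,
\quad
Q = \frac{1}{2m}\sum_{i,j}\left(A_{ij} - \frac{k_i k_j}{2m}\right)\delta(c_i, c_j),
$$

which is why the host-side `modularity()` wrapper that follows returns the negated reduction of `Q_arr`.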
+template +__global__ void +kernel_modularity_no_matrix(const int n_vertex, const int n_clusters, ValType m2, + IdxType* csr_ptr_ptr, IdxType* csr_ind_ptr, ValType* csr_val_ptr, + IdxType* cluster_ptr, IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, + bool weighted, ValType* k_vec_ptr, ValType* Q_arr_ptr, ValType* temp_i_ptr){ + ValType m2_s(m2); + modularity_no_matrix(n_vertex, n_clusters, m2_s, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + weighted, k_vec_ptr, Q_arr_ptr, temp_i_ptr ); + +} + +template +ValType +modularity(const int n_vertex, int n_edges, const int n_clusters, ValType m2, + IdxType* csr_ptr_ptr, IdxType* csr_ind_ptr, ValType* csr_val_ptr, + IdxType* cluster_ptr, IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, + bool weighted, ValType* k_vec_ptr, + ValType* Q_arr_ptr, ValType* temp_i_ptr // temporary space for calculation + ){ + + thrust::fill(thrust::device, temp_i_ptr, temp_i_ptr + n_edges, 0.0); + + int nthreads = min(n_vertex,CUDA_MAX_KERNEL_THREADS); + int nblocks = min((n_vertex + nthreads - 1)/nthreads,CUDA_MAX_BLOCKS); + kernel_modularity_no_matrix<<>>(n_vertex, n_clusters, m2, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + weighted, k_vec_ptr, Q_arr_ptr, temp_i_ptr); + + CUDA_CALL(cudaDeviceSynchronize()); + + ValType Q = thrust::reduce(thrust::cuda::par, Q_arr_ptr, Q_arr_ptr + n_vertex, (ValType)(0.0)); + + return -Q; + +} + +/*********************** +cluster_iter(n_vertex) +cluster_inv_ptr(c_size + 1) +cluster_inv_ind(n_vertex) +seq_idx(n_vertex) [0, 1, 2, ... , n_vertex -1] +***********************/ +template +__global__ void +generate_cluster_inv_ptr(const int n_vertex, const int c_size, IdxIter cluster_iter, IdxType* cluster_inv_ptr){ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + IdxType ci; + //Inital cluster_inv_ptr outside!!! + + if(tid < n_vertex){ + ci = *(cluster_iter + tid); + atomicAdd(cluster_inv_ptr + ci, 1); + } +} + + +template +void +generate_cluster_inv(const int n_vertex, const int c_size, + IdxIter cluster_iter, + thrust::device_vector& cluster_inv_ptr, + thrust::device_vector& cluster_inv_ind){ + + int nthreads = min(n_vertex,CUDA_MAX_KERNEL_THREADS); + int nblocks = min((n_vertex + nthreads - 1)/nthreads,CUDA_MAX_BLOCKS); + thrust::fill(thrust::cuda::par, cluster_inv_ptr.begin(), cluster_inv_ptr.end(), 0); + cudaCheckError(); + IdxType* cluster_inv_ptr_ptr = thrust::raw_pointer_cast(cluster_inv_ptr.data()); + + generate_cluster_inv_ptr<<>>(n_vertex, c_size, cluster_iter, cluster_inv_ptr_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + +#ifdef DEBUG + if((unsigned)c_size + 1 > cluster_inv_ptr.size()) + std::cout<<"Error cluster_inv_ptr run out of memory\n"; +#endif + + thrust::exclusive_scan(thrust::device, cluster_inv_ptr.begin(), cluster_inv_ptr.begin() + c_size + 1 , cluster_inv_ptr.begin()); + cudaCheckError(); + + thrust::sequence(thrust::device, cluster_inv_ind.begin(), cluster_inv_ind.end(), 0); + cudaCheckError(); + thrust::sort(thrust::device, cluster_inv_ind.begin(), cluster_inv_ind.begin() + n_vertex, sort_by_cluster(cluster_iter)); + cudaCheckError(); + +} + + +}// nvlouvain diff --git a/cpp/nvgraph/cpp/include/modularity_maximization.hxx b/cpp/nvgraph/cpp/include/modularity_maximization.hxx new file mode 100644 index 00000000000..94e66be69ff --- /dev/null +++ b/cpp/nvgraph/cpp/include/modularity_maximization.hxx @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "nvgraph_error.hxx" +#include "valued_csr_graph.hxx" +#include "matrix.hxx" + + +namespace nvgraph { + /** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR modularity_maximization( ValuedCsrGraph& G, + IndexType_ nClusters, + IndexType_ nEigVecs, + IndexType_ maxIter_lanczos, + IndexType_ restartIter_lanczos, + ValueType_ tol_lanczos, + IndexType_ maxIter_kmeans, + ValueType_ tol_kmeans, + IndexType_ * __restrict__ clusters, + Vector &eigVals, + Vector &eigVecs, + IndexType_ & iters_lanczos, + IndexType_ & iters_kmeans) ; + + + /// Compute modularity + /** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param parts (Input, device memory, n entries) Cluster assignments. + * @param modularity On exit, modularity + */ + template + NVGRAPH_ERROR analyzeModularity(ValuedCsrGraph & G, + IndexType_ nClusters, + const IndexType_ * __restrict__ parts, + ValueType_ & modularity) ; + +} + diff --git a/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx b/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx new file mode 100644 index 00000000000..2af20f252af --- /dev/null +++ b/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
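A sketch of scoring an existing partition with `analyzeModularity()` from the header above, assuming `<IndexType_, ValueType_> = <int, float>` and a `ValuedCsrGraph` built elsewhere (valued_csr_graph.hxx); the wrapper name is illustrative:

```cpp
#include "modularity_maximization.hxx"

// parts_dev: device memory, one cluster id per vertex.
float score_clustering(nvgraph::ValuedCsrGraph<int, float>& G,
                       int nClusters,
                       const int* parts_dev)
{
    float modularity = 0.0f;
    nvgraph::analyzeModularity(G, nClusters, parts_dev, modularity);
    return modularity;
}
```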
+ */ + +#pragma once + +#include "csr_graph.hxx" +#include "valued_csr_graph.hxx" +#include + +namespace nvgraph +{ + +template +class MultiValuedCsrGraph : public nvgraph::CsrGraph +{ +public: + typedef IndexType_ IndexType; + typedef ValueType_ ValueType; +private: + typedef nvgraph::CsrGraph Parent; + +protected: + /*! Storage for the nonzero entries of the multi CSR data structure. + */ + //std::vector *> values_dim; + //std::vector *> vertex_dim; + + std::vector > > values_dim; + std::vector > > vertex_dim; +public: + + /*! Storage for the nonzero entries of the Multi-CSR data structure.*/ + MultiValuedCsrGraph(void) {} + ~MultiValuedCsrGraph(void) + { + //for (int i = 0; i < n_vertex_dim; ++i) + // if (vertex_dim[i]) + // delete vertex_dim[i]; + // for (int i = 0; i < n_edges_dim; ++i) + // if (values_dim[i]) + // delete values_dim[i]; + } + + /*! Construct a \p MultiValuedCsrGraph with a specific shape and number of nonzero entries. + * + * \param num_rows Number of rows. + * \param num_entries Number of nonzero graph entries. + * \param num_dimensions Number of dimensions (ie. number of values arrays). + */ + MultiValuedCsrGraph(size_t num_rows, size_t num_entries, cudaStream_t stream) + : Parent(num_rows, num_entries, stream) { } + + /*! Construct a \p MultiValuedCsrGraph from another graph.*/ + MultiValuedCsrGraph(const MultiValuedCsrGraph& gr) + : Parent(gr), + values_dim(gr.values_dim), + vertex_dim(gr.vertex_dim) + + {} + MultiValuedCsrGraph(const Parent& gr) + : Parent(gr) + {} + + inline void allocateVertexData(size_t v_dim, cudaStream_t stream) + { + vertex_dim.resize(v_dim); + for (size_t i = 0; i < vertex_dim.size(); ++i) + vertex_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_vertices, stream)); + } + + inline void allocateEdgeData(size_t edges_dim, cudaStream_t stream) + { + values_dim.resize(edges_dim); + for (size_t i = 0; i < values_dim.size(); ++i) + values_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_edges, stream)); + } + + inline void attachVertexData(size_t i, ValueType* data, cudaStream_t stream) + { + if (vertex_dim.size() <= i) + vertex_dim.resize(i+1); + vertex_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_vertices, data, stream)); + } + + inline void attachEdgeData(size_t i, ValueType* data, cudaStream_t stream) + { + if (values_dim.size() <= i) + values_dim.resize(i+1); + values_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_edges, data, stream)); + } + + inline size_t getNumValues() { + return values_dim.size(); + } + + inline size_t get_num_vertex_dim() const { return vertex_dim.size(); } + inline size_t get_num_edge_dim() const { return values_dim.size(); } + inline Vector& get_vertex_dim(size_t v_dim) { return *vertex_dim[v_dim]; } + inline Vector& get_edge_dim(size_t e_dim) { return *values_dim[e_dim]; } + inline ValueType* get_raw_vertex_dim(size_t v_dim) { return vertex_dim[v_dim]->raw(); } + inline ValueType* get_raw_edge_dim(size_t e_dim) { return values_dim[e_dim]->raw(); } + inline const Vector& get_vertex_dim(size_t v_dim) const { return *vertex_dim[v_dim]; } + inline const Vector& get_edge_dim(size_t e_dim) const { return *values_dim[e_dim]; } + inline const ValueType* get_raw_vertex_dim(size_t v_dim) const { return vertex_dim[v_dim]->raw(); } + inline const ValueType* get_raw_edge_dim(size_t e_dim) const { return values_dim[e_dim]->raw(); } + /*! 
Extract a \p ValuedCsrGraph from a given dimension of the \p MultiValuedCsrGraph + * \param dim_index Wanted dimension of the \p MultiValuedCsrGraph + */ + ValuedCsrGraph* get_valued_csr_graph(const size_t dim_index) + { + //ValuedCsrGraph *v = new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index]); + //return *v; + + //SHARED_PREFIX::shared_ptr > svcsr = SHARED_PREFIX::shared_ptr >(new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index])); + //return svcsr; //segfaults + + ///return ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index]);//segfaults + ValuedCsrGraph* pvcsr = new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index]); + return pvcsr; + } + + + + /*! Assignment from another MultiValuedCsrGraph graph. + * + * \param graph Another MultiValuedCsrGraph + */ + MultiValuedCsrGraph& operator=(const MultiValuedCsrGraph& graph); + + + //RESIZE: We should try not to resize MULTI CSR graphs in general for performance reasons + + // SET + //Set should be done in a safe way in the API + // it is possible to use a cudaMemcpy like : cudaMemcpy(G.get_raw_vertex_dim(1), v_h, + // (size_t)(n*sizeof(v_h[0])), + // cudaMemcpyHostToDevice); + + //Accept method injection + DEFINE_VISITABLE(IndexType_) + +}; // class MultiValuedCsrGraph +} + diff --git a/cpp/nvgraph/cpp/include/nvgraph.h b/cpp/nvgraph/cpp/include/nvgraph.h new file mode 100644 index 00000000000..f51daf68b0a --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph.h @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
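A sketch of the per-dimension value handling in `MultiValuedCsrGraph` above, assuming `<IndexType_, ValueType_> = <int, float>`; the CSR topology itself is owned by the `CsrGraph` base class (csr_graph.hxx) and is not shown here, and the function name is illustrative:

```cpp
#include <cuda_runtime.h>
#include "multi_valued_csr_graph.hxx"

void attach_two_edge_weights(size_t n_vertices, size_t n_edges,
                             float* weights0_dev, float* weights1_dev,
                             cudaStream_t stream)
{
    nvgraph::MultiValuedCsrGraph<int, float> g(n_vertices, n_edges, stream);

    // Allocate one fresh device vector of vertex data ...
    g.allocateVertexData(1, stream);

    // ... and wrap two existing device arrays as edge-value dimensions.
    g.attachEdgeData(0, weights0_dev, stream);
    g.attachEdgeData(1, weights1_dev, stream);

    float* w1 = g.get_raw_edge_dim(1);  // raw device pointer of dimension 1
    (void)w1;
}
```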
+ */ + +#ifndef _NVGRAPH_H_ +#define _NVGRAPH_H_ + +#include "stddef.h" +#include "stdint.h" + +#include "library_types.h" + + +#define NVG_CUDA_TRY(T) {\ + if (T != cudaSuccess)\ + return NVGRAPH_STATUS_ALLOC_FAILED;\ + } + +#ifndef NVGRAPH_API +#ifdef _WIN32 +#define NVGRAPH_API __stdcall +#else +#define NVGRAPH_API +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + /* nvGRAPH status type returns */ + typedef enum + { + NVGRAPH_STATUS_SUCCESS = 0, + NVGRAPH_STATUS_NOT_INITIALIZED = 1, + NVGRAPH_STATUS_ALLOC_FAILED = 2, + NVGRAPH_STATUS_INVALID_VALUE = 3, + NVGRAPH_STATUS_ARCH_MISMATCH = 4, + NVGRAPH_STATUS_MAPPING_ERROR = 5, + NVGRAPH_STATUS_EXECUTION_FAILED = 6, + NVGRAPH_STATUS_INTERNAL_ERROR = 7, + NVGRAPH_STATUS_TYPE_NOT_SUPPORTED = 8, + NVGRAPH_STATUS_NOT_CONVERGED = 9, + NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED = 10 + + } nvgraphStatus_t; + + const char* nvgraphStatusGetString(nvgraphStatus_t status); + + /* Opaque structure holding nvGRAPH library context */ + struct nvgraphContext; + typedef struct nvgraphContext *nvgraphHandle_t; + + /* Opaque structure holding the graph descriptor */ + struct nvgraphGraphDescr; + typedef struct nvgraphGraphDescr *nvgraphGraphDescr_t; + + /* Semi-ring types */ + typedef enum + { + NVGRAPH_PLUS_TIMES_SR = 0, + NVGRAPH_MIN_PLUS_SR = 1, + NVGRAPH_MAX_MIN_SR = 2, + NVGRAPH_OR_AND_SR = 3, + } nvgraphSemiring_t; + + /* Topology types */ + typedef enum + { + NVGRAPH_CSR_32 = 0, + NVGRAPH_CSC_32 = 1, + NVGRAPH_COO_32 = 2, + NVGRAPH_2D_32I_32I = 3, + NVGRAPH_2D_64I_32I = 4 + } nvgraphTopologyType_t; + + typedef enum + { + NVGRAPH_DEFAULT = 0, // Default is unsorted. + NVGRAPH_UNSORTED = 1, // + NVGRAPH_SORTED_BY_SOURCE = 2, // CSR + NVGRAPH_SORTED_BY_DESTINATION = 3 // CSC + } nvgraphTag_t; + + typedef enum + { + NVGRAPH_MULTIPLY = 0, + NVGRAPH_SUM = 1, + NVGRAPH_MIN = 2, + NVGRAPH_MAX = 3 + } nvgraphSemiringOps_t; + + typedef enum + { + NVGRAPH_MODULARITY_MAXIMIZATION = 0, //maximize modularity with Lanczos solver + NVGRAPH_BALANCED_CUT_LANCZOS = 1, //minimize balanced cut with Lanczos solver + NVGRAPH_BALANCED_CUT_LOBPCG = 2 //minimize balanced cut with LOPCG solver + } nvgraphSpectralClusteringType_t; + + struct SpectralClusteringParameter { + int n_clusters; //number of clusters + int n_eig_vects; // //number of eigenvectors + nvgraphSpectralClusteringType_t algorithm; // algorithm to use + float evs_tolerance; // tolerance of the eigensolver + int evs_max_iter; // maximum number of iterations of the eigensolver + float kmean_tolerance; // tolerance of kmeans + int kmean_max_iter; // maximum number of iterations of kemeans + void * opt; // optional parameter that can be used for preconditioning in the future + }; + + typedef enum + { + NVGRAPH_MODULARITY, // clustering score telling how good the clustering is compared to random assignment. + NVGRAPH_EDGE_CUT, // total number of edges between clusters. 
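Every entry point in this header returns an nvgraphStatus_t, and nvgraphStatusGetString is declared above, so a small checking macro keeps caller code readable. A minimal sketch follows; the macro name is hypothetical and not part of this header.

```cpp
#include <cstdio>
#include <cstdlib>
#include "nvgraph.h"

// Hypothetical helper: turn any non-success status into a readable message
// via nvgraphStatusGetString and abort.
#define CHECK_NVGRAPH(call)                                                   \
  do {                                                                        \
    nvgraphStatus_t s_ = (call);                                              \
    if (s_ != NVGRAPH_STATUS_SUCCESS) {                                       \
      std::fprintf(stderr, "nvGRAPH error %d (%s) at %s:%d\n", (int)s_,       \
                   nvgraphStatusGetString(s_), __FILE__, __LINE__);           \
      std::exit(EXIT_FAILURE);                                                \
    }                                                                         \
  } while (0)

int main() {
  nvgraphHandle_t handle = nullptr;
  CHECK_NVGRAPH(nvgraphCreate(&handle));   // open the library
  CHECK_NVGRAPH(nvgraphDestroy(handle));   // close it again
  return 0;
}
```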
+ NVGRAPH_RATIO_CUT // sum for all clusters of the number of edges going outside of the cluster divided by the number of vertex inside the cluster + } nvgraphClusteringMetric_t; + + struct nvgraphCSRTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *source_offsets; // rowPtr + int *destination_indices; // colInd + }; + typedef struct nvgraphCSRTopology32I_st *nvgraphCSRTopology32I_t; + + struct nvgraphCSCTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *destination_offsets; // colPtr + int *source_indices; // rowInd + }; + typedef struct nvgraphCSCTopology32I_st *nvgraphCSCTopology32I_t; + + struct nvgraphCOOTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *source_indices; // rowInd + int *destination_indices; // colInd + nvgraphTag_t tag; + }; + typedef struct nvgraphCOOTopology32I_st *nvgraphCOOTopology32I_t; + + struct nvgraph2dCOOTopology32I_st { + int nvertices; + int nedges; + int *source_indices; // Row Indices + int *destination_indices; // Column Indices + cudaDataType_t valueType; // The type of values being given. + void *values; // Pointer to array of values. + int numDevices; // Gives the number of devices to be used. + int *devices; // Array of device IDs to use. + int blockN; // Specifies the value of n for an n x n matrix decomposition. + nvgraphTag_t tag; + }; + typedef struct nvgraph2dCOOTopology32I_st *nvgraph2dCOOTopology32I_t; + + /* Return properties values for the nvGraph library, such as library version */ + nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value); + + /* Open the library and create the handle */ + nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle); + nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti( nvgraphHandle_t *handle, + int numDevices, + int* devices); + + /* Close the library and destroy the handle */ + nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle); + + /* Create an empty graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr( nvgraphHandle_t handle, + nvgraphGraphDescr_t *descrG); + + /* Destroy a graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG); + + /* Set size, topology data in the graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TType); + + /* Query size and topology information from the graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* TType); + + /* Allocate numsets vectors of size V representing Vertex Data and attached them the graph. + * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ + nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes); + + /* Allocate numsets vectors of size E representing Edge Data and attached them the graph. 
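The topology structs map directly onto the usual sparse-matrix arrays: for NVGRAPH_CSR_32, source_offsets is the row-pointer array with nvertices+1 entries and destination_indices holds one column index per edge. A minimal sketch of filling one and handing it to nvgraphSetGraphStructure (toy graph, status checks omitted):

```cpp
#include <cstdio>
#include "nvgraph.h"

int main() {
  // 3-vertex, 4-edge digraph: 0->1, 0->2, 1->2, 2->0.
  int source_offsets[]      = {0, 2, 3, 4};  // rowPtr: nvertices + 1 entries
  int destination_indices[] = {1, 2, 2, 0};  // colInd: one entry per edge

  nvgraphHandle_t handle;     nvgraphCreate(&handle);
  nvgraphGraphDescr_t graph;  nvgraphCreateGraphDescr(handle, &graph);

  nvgraphCSRTopology32I_st topology = {3, 4, source_offsets, destination_indices};
  nvgraphSetGraphStructure(handle, graph, &topology, NVGRAPH_CSR_32);
  std::printf("graph structure set\n");

  nvgraphDestroyGraphDescr(handle, graph);
  nvgraphDestroy(handle);
  return 0;
}
```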
+ * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ + nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes); + + /* Update the vertex set #setnum with the data in *vertexData, sets have 0-based index + * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ + nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum); + + /* Copy the edge set #setnum in *edgeData, sets have 0-based index + * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ + nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum); + + /* Convert the edge data to another topology + */ + nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData); + + /* Convert graph to another structure + */ + nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType); + + /* Update the edge set #setnum with the data in *edgeData, sets have 0-based index + */ + nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum); + + /* Copy the edge set #setnum in *edgeData, sets have 0-based index + */ + nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum); + + /* create a new graph by extracting a subgraph given a list of vertices + */ + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex( nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices); + /* create a new graph by extracting a subgraph given a list of edges + */ + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges); + + /* nvGRAPH Semi-ring sparse matrix vector multiplication + */ + nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x_index, + const void *beta, + const size_t y_index, + const nvgraphSemiring_t SR); + + /* Helper struct for Traversal parameters + */ + typedef struct { + size_t pad[128]; + } nvgraphTraversalParameter_t; + + /* Initializes traversal parameters with default values + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param); + + /* Stores/retrieves index of a vertex data where target distances will be stored + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex( nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex( const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves index of a vertex data where path predecessors will be stored + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex( nvgraphTraversalParameter_t *param, 
+ const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex( const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves index of an edge data which tells traversal algorithm whether path can go through an edge or not + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex( nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex( const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves flag that tells an algorithm whether the graph is directed or not + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag( nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag( const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves 'alpha' and 'beta' parameters for BFS traversal algorithm + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha( nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha( const nvgraphTraversalParameter_t param, + size_t *value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta( nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta( const nvgraphTraversalParameter_t param, + size_t *value); + +//Traversal available + typedef enum { + NVGRAPH_TRAVERSAL_BFS = 0 + } nvgraphTraversal_t; + + /* nvGRAPH Traversal API + * Compute a traversal of the graph from a single vertex using algorithm specified by traversalT parameter + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vert, + const nvgraphTraversalParameter_t params); + + /** + * CAPI Method for calling 2d BFS algorithm. + * @param handle Nvgraph context handle. + * @param descrG Graph handle (must be 2D partitioned) + * @param source_vert The source vertex ID + * @param distances Pointer to memory allocated to store the distances. + * @param predecessors Pointer to memory allocated to store the predecessors + * @return Status code. + */ + nvgraphStatus_t NVGRAPH_API nvgraph2dBfs( nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors); + + /* nvGRAPH Single Source Shortest Path (SSSP) + * Calculate the shortest path distance from a single vertex in the graph to all other vertices. + */ + nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp_index); + + /* nvGRAPH WidestPath + * Find widest path potential from source_index to every other vertices. + */ + nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path_index); + + /* nvGRAPH PageRank + * Find PageRank for each vertex of a graph with a given transition probabilities, a bookmark vector of dangling vertices, and the damping factor. 
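Putting the pieces above together, a typical SSSP call builds a CSC topology, allocates one vertex set for the output distances and one edge set for the weights, then reads the result back with nvgraphGetVertexData. A hedged end-to-end sketch follows; the graph and weights are invented for illustration and status checks are omitted for brevity.

```cpp
#include <cstdio>
#include <vector>
#include "nvgraph.h"

int main() {
  // Toy 6-vertex, 10-edge weighted digraph in CSC form (values invented).
  const int n = 6, nnz = 10;
  int destination_offsets[] = {0, 1, 3, 4, 6, 8, 10};          // size n+1
  int source_indices[]      = {2, 0, 2, 0, 4, 5, 2, 3, 3, 4};  // size nnz
  float weights[]           = {0.33f, 0.50f, 0.33f, 0.50f, 0.50f,
                               1.00f, 0.33f, 0.50f, 0.50f, 0.50f};
  std::vector<float> distances(n);

  nvgraphHandle_t handle;     nvgraphCreate(&handle);
  nvgraphGraphDescr_t graph;  nvgraphCreateGraphDescr(handle, &graph);

  nvgraphCSCTopology32I_st csc = {n, nnz, destination_offsets, source_indices};
  cudaDataType_t vtype = CUDA_R_32F, etype = CUDA_R_32F;

  nvgraphSetGraphStructure(handle, graph, &csc, NVGRAPH_CSC_32);
  nvgraphAllocateVertexData(handle, graph, 1, &vtype);   // set 0: distances
  nvgraphAllocateEdgeData(handle, graph, 1, &etype);     // set 0: weights
  nvgraphSetEdgeData(handle, graph, weights, 0);

  int source = 0;
  nvgraphSssp(handle, graph, 0 /*weight set*/, &source, 0 /*distance set*/);
  nvgraphGetVertexData(handle, graph, distances.data(), 0);

  for (int v = 0; v < n; ++v)
    std::printf("distance(%d -> %d) = %f\n", source, v, distances[v]);

  nvgraphDestroyGraphDescr(handle, graph);
  nvgraphDestroy(handle);
  return 0;
}
```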
+ */ + nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark_index, + const int has_guess, + const size_t pagerank_index, + const float tolerance, + const int max_iter); + + /* nvGRAPH contraction + * given array of agregates contract graph with + * given (Combine, Reduce) operators for Vertex Set + * and Edge Set; + */ + nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag); + + /* nvGRAPH spectral clustering + * given a graph and solver parameters of struct SpectralClusteringParameter, + * assign vertices to groups such as + * intra-group connections are strong and/or inter-groups connections are weak + * using spectral technique. + */ + nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const struct SpectralClusteringParameter *params, + int* clustering, + void* eig_vals, + void* eig_vects); + + /* nvGRAPH analyze clustering + * Given a graph, a clustering, and a metric + * compute the score that measures the clustering quality according to the metric. + */ + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const int n_clusters, + const int* clustering, + nvgraphClusteringMetric_t metric, + float * score); + + /* nvGRAPH Triangles counting + * count number of triangles (cycles of size 3) formed by graph edges + */ + nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + uint64_t* result); + + /* nvGRAPH Louvain implementation + */ + nvgraphStatus_t NVGRAPH_API nvgraphLouvain ( cudaDataType_t index_type, + cudaDataType_t val_type, + const size_t num_vertex, + const size_t num_edges, + void* csr_ptr, + void* csr_ind, + void* csr_val, + int weighted, + int has_init_cluster, + void* init_cluster, + void* final_modularity, + void* best_cluster_vec, + void* num_level); + + + /* nvGRAPH Jaccard implementation + */ + nvgraphStatus_t NVGRAPH_API nvgraphJaccard ( cudaDataType_t index_type, + cudaDataType_t val_type, + const size_t n, + const size_t e, + void* csr_ptr, + void *csr_ind, + void* csr_val, + int weighted, + void* v, + void* gamma, + void* weight_j); + + /* nvGRAPH attach structure + * Warp external device data into a nvgraphGraphDescr_t + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT); + + /* nvGRAPH attach Vertex Data + * Warp external device data into a vertex dim + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData); + + /* nvGRAPH attach Edge Data + * Warp external device data into an edge dim + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t 
settype, + void *edgeData); + +#if defined(__cplusplus) +} /* extern "C" */ +#endif + +#endif /* _NVGRAPH_H_ */ + diff --git a/cpp/nvgraph/cpp/include/nvgraphP.h b/cpp/nvgraph/cpp/include/nvgraphP.h new file mode 100644 index 00000000000..8e6080e874d --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraphP.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * + * + * WARNING: this is a private header file, it should not be publically exposed. + * + * + */ + +#pragma once +#include "nvgraph.h" +#include "cnmem.h" + +#if defined(__cplusplus) + extern "C" { +#endif + +/* Graph descriptor types */ +typedef enum +{ + IS_EMPTY = 0, //nothing + HAS_TOPOLOGY = 1, //connectivity info + HAS_VALUES = 2, //MultiValuedCSRGraph + IS_2D = 3 +} nvgraphGraphStatus_t; + +struct nvgraphContext { + cudaStream_t stream; + cnmemDevice_t cnmem_device; + int nvgraphIsInitialized; +}; + +struct nvgraphGraphDescr { + nvgraphGraphStatus_t graphStatus; + cudaDataType T; // This is the type of values for the graph + nvgraphTopologyType_t TT; // The topology type (class to cast graph_handle pointer to) + void* graph_handle; // Opaque pointer to the graph class object +}; + +#if defined(__cplusplus) +}//extern "C" +#endif + diff --git a/cpp/nvgraph/cpp/include/nvgraph_convert.hxx b/cpp/nvgraph/cpp/include/nvgraph_convert.hxx new file mode 100644 index 00000000000..f0c5620e7e7 --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_convert.hxx @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
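nvgraphP.h is the private half of the opaque-handle pattern: nvgraph.h only forward-declares nvgraphContext and nvgraphGraphDescr, so callers hold pointers without ever seeing the cnmem device, stream, or graph_handle fields. A generic, standalone illustration of that idiom follows; all names are hypothetical and unrelated to nvGraph's internals.

```cpp
#include <cstdio>

// Public-header side: callers only ever see a pointer to an incomplete type.
struct example_context;                           // hypothetical example type
typedef struct example_context* example_handle_t;

int  example_create(example_handle_t* out);
void example_destroy(example_handle_t h);

// Private side (what a "...P.h" header or the .cpp would hold): the real layout.
struct example_context {
    bool initialized;
    int  device_id;
};

int example_create(example_handle_t* out) {
    if (out == nullptr) return 1;
    *out = new example_context{true, 0};
    return 0;
}

void example_destroy(example_handle_t h) { delete h; }

int main() {
    example_handle_t h = nullptr;
    example_create(&h);          // caller never touches the fields directly
    example_destroy(h);
    std::printf("opaque handle round trip done\n");
    return 0;
}
```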
+ */ + #pragma once + +#include +#include +#include + +namespace nvgraph{ + void csr2coo( const int *csrSortedRowPtr, + int nnz, int m, + int *cooRowInd, + cusparseIndexBase_t idxBase); + void coo2csr( const int *cooRowInd, + int nnz, int m, + int *csrSortedRowPtr, + cusparseIndexBase_t idxBase ); + + void csr2csc( int m, int n, int nnz, + const void *csrVal, const int *csrRowPtr, const int *csrColInd, + void *cscVal, int *cscRowInd, int *cscColPtr, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cudaDataType_t *dataType); + void csc2csr( int m, int n, int nnz, + const void *cscVal, const int *cscRowInd, const int *cscColPtr, + void *csrVal, int *csrRowPtr, int *csrColInd, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cudaDataType_t *dataType); + + void csr2cscP( int m, int n, int nnz, + const int *csrRowPtr, const int *csrColInd, + int *cscRowInd, int *cscColPtr, int *p, cusparseIndexBase_t idxBase); + + + void cooSortBySource(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType); + void cooSortByDestination(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType); + + void coos2csc(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType); + void cood2csr(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType); + void coou2csr(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType); + void coou2csc(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType); + + ////////////////////////// Utility functions ////////////////////////// + void createIdentityPermutation(int n, int *p); + void gthrX(int nnz, const void *y, void *xVal, const int *xInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType); + + void cooSortBufferSize(int m, int n, int nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes); + void cooGetSourcePermutation(int m, int n, int nnz, int *cooRows, int *cooCols, int *p, void *pBuffer); + void cooGetDestinationPermutation(int m, int n, int nnz, int *cooRows, int *cooCols, int *p, void *pBuffer); + + void csr2csc2BufferSize(int m, int n, int nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSize); + void csr2csc2(int m, int n, int nnz, + const int *csrRowPtr, const int *csrColInd, + int *cscRowInd, int *cscColPtr, int *p, void *pBuffer, + cusparseIndexBase_t idxBase); + +} //end nvgraph namespace diff --git a/cpp/nvgraph/cpp/include/nvgraph_csrmv.hxx b/cpp/nvgraph/cpp/include/nvgraph_csrmv.hxx new file mode 100644 index 00000000000..d85dda06943 --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_csrmv.hxx @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
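The conversion helpers above wrap cuSPARSE routines, but their core semantics are easy to state on the host. Below is a small CPU reference sketch of the csr2coo/coo2csr behaviour; it is illustrative only, not the library implementation, and the coo2csr half assumes the COO entries are already grouped by row.

```cpp
#include <cstdio>
#include <vector>

// csr2coo: expand row offsets into one row index per nonzero.
std::vector<int> csr2coo_host(const std::vector<int>& row_ptr, int nnz) {
    std::vector<int> row_ind(nnz);
    const int m = static_cast<int>(row_ptr.size()) - 1;
    for (int r = 0; r < m; ++r)
        for (int j = row_ptr[r]; j < row_ptr[r + 1]; ++j)
            row_ind[j] = r;
    return row_ind;
}

// coo2csr: count entries per row, then prefix-sum the counts into offsets
// (assumes the COO entries are already sorted/grouped by row).
std::vector<int> coo2csr_host(const std::vector<int>& row_ind, int m) {
    std::vector<int> row_ptr(m + 1, 0);
    for (int r : row_ind) ++row_ptr[r + 1];
    for (int i = 0; i < m; ++i) row_ptr[i + 1] += row_ptr[i];
    return row_ptr;
}

int main() {
    std::vector<int> row_ptr = {0, 2, 3, 5};              // 3 rows, 5 nonzeros
    std::vector<int> row_ind = csr2coo_host(row_ptr, 5);  // {0,0,1,2,2}
    std::vector<int> back    = coo2csr_host(row_ind, 3);  // {0,2,3,5}
    for (int r : row_ind) std::printf("%d ", r);
    std::printf("\n");
    for (int p : back) std::printf("%d ", p);
    std::printf("\n");
    return 0;
}
```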
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include "valued_csr_graph.hxx" +#include "nvgraph_vector.hxx" + +namespace nvgraph{ + +//this header file defines the various semirings using enum + enum Semiring + {//the datatype is assumed to be real unless otherwise specified in the name + PlusTimes, //standard matrix vector multiplication + MinPlus, //breadth first search-also called tropical + MaxMin, //mas flow problems + OrAndBool, + LogPlus + }; + +//Merge Path Coord array depends on the integere type +template +struct Coord +{ + IndexType_ x; + IndexType_ y; +}; + +//struct which stores the csr matrix format, templated on the index and value + template + struct CsrMvParams { + ValueType_ alpha; + ValueType_ beta; + ValueType_ *csrVal; //nonzero values from matrix A + //row pointer must look at next address to avoid the 0 in merge path + IndexType_ *csrRowPtr; //row offsets last entry is number of nonzeros size is m +1 + IndexType_ *csrColInd; //column indices of nonzeros + ValueType_ *x; //vector x in alpha*A*x + ValueType_ *y; //output y will be modified and store the output + IndexType_ m; //number of rows + IndexType_ n; //number of columns + IndexType_ nnz; + }; + +//create a device function interface to call the above dispatch function +template +cudaError_t csrmv_mp( + IndexType_ n, + IndexType_ m, + IndexType_ nnz, + ValueType_ alpha, + ValueType_ * dValues, //all must be preallocated on the device + IndexType_ * dRowOffsets, + IndexType_ * dColIndices, + ValueType_ *dVectorX, + ValueType_ beta, + ValueType_ *dVectorY, + Semiring SR, //this parameter is of type enum and gives the semiring name + cudaStream_t stream = 0 ); +//overloaded function that has valued_csr_graph parameter to store the matrix +template +cudaError_t csrmv_mp( + IndexType_ n, + IndexType_ m, + IndexType_ nnz, + ValueType_ alpha, + ValuedCsrGraph network, + ValueType_ *dVectorX, + ValueType_ beta, + ValueType_ *dVectorY, + Semiring SR, //this parameter is of type enum and gives the semiring name + cudaStream_t stream = 0); +} //end nvgraph namespace + +template +void callTestCsrmv(IndexType_ num_rows, IndexType_ *dRowOffsets, IndexType_ *dColIndices, ValueType_ *dValues, + ValueType_ *dVectorX, ValueType_ *dVectorY, nvgraph::Semiring SR, ValueType_ alpha, ValueType_ beta); + diff --git a/cpp/nvgraph/cpp/include/nvgraph_cublas.hxx b/cpp/nvgraph/cpp/include/nvgraph_cublas.hxx new file mode 100644 index 00000000000..bddbbf18ae1 --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_cublas.hxx @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
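csrmv_mp generalizes SpMV over the semirings enumerated above: PlusTimes is ordinary matrix-vector multiplication, while MinPlus replaces (+, x) with (min, +), which amounts to one relaxation sweep of shortest-path style algorithms. The host reference below is a simplified sketch that ignores the alpha/beta scaling in CsrMvParams; it only shows the semiring semantics.

```cpp
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

enum class Semiring { PlusTimes, MinPlus };

// y = A (*) x combined with the semiring's (+); alpha/beta scaling omitted.
std::vector<float> csrmv_reference(const std::vector<int>& row_ptr,
                                   const std::vector<int>& col_ind,
                                   const std::vector<float>& val,
                                   const std::vector<float>& x,
                                   Semiring sr) {
    const float inf = std::numeric_limits<float>::infinity();
    const int m = static_cast<int>(row_ptr.size()) - 1;
    std::vector<float> y(m, sr == Semiring::PlusTimes ? 0.0f : inf);
    for (int r = 0; r < m; ++r)
        for (int j = row_ptr[r]; j < row_ptr[r + 1]; ++j) {
            const int c = col_ind[j];
            if (sr == Semiring::PlusTimes)
                y[r] += val[j] * x[c];                   // usual SpMV
            else
                y[r] = std::min(y[r], val[j] + x[c]);    // one SSSP-style relaxation
        }
    return y;
}

int main() {
    std::vector<int>   row_ptr = {0, 2, 3, 4};
    std::vector<int>   col_ind = {1, 2, 2, 0};
    std::vector<float> val     = {1.f, 4.f, 2.f, 7.f};
    std::vector<float> x       = {0.f, 10.f, 10.f};
    auto y1 = csrmv_reference(row_ptr, col_ind, val, x, Semiring::PlusTimes);
    auto y2 = csrmv_reference(row_ptr, col_ind, val, x, Semiring::MinPlus);
    std::printf("plus-times: %g %g %g\n", y1[0], y1[1], y1[2]);
    std::printf("min-plus:   %g %g %g\n", y2[0], y2[1], y2[2]);
    return 0;
}
```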
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "debug_macros.h" + +namespace nvgraph +{ +class Cublas; + +class Cublas +{ +private: + static cublasHandle_t m_handle; + // Private ctor to prevent instantiation. + Cublas(); + ~Cublas(); +public: + + // Get the handle. + static cublasHandle_t get_handle() + { + if (m_handle == 0) + CHECK_CUBLAS(cublasCreate(&m_handle)); + return m_handle; + } + + static void destroy_handle() + { + if (m_handle != 0) + CHECK_CUBLAS(cublasDestroy(m_handle)); + m_handle = 0; + } + + static void set_pointer_mode_device(); + static void set_pointer_mode_host(); + static void setStream(cudaStream_t stream) + { + cublasHandle_t handle = Cublas::get_handle(); + CHECK_CUBLAS(cublasSetStream(handle, stream)); + } + + template + static void axpy(int n, T alpha, + const T* x, int incx, + T* y, int incy); + + template + static void copy(int n, const T* x, int incx, + T* y, int incy); + + template + static void dot(int n, const T* x, int incx, + const T* y, int incy, + T* result); + + template + static void gemv(bool transposed, int m, int n, + const T* alpha, const T* A, int lda, + const T* x, int incx, + const T* beta, T* y, int incy); + + template + static void gemv_ext(bool transposed, const int m, const int n, + const T* alpha, const T* A, const int lda, + const T* x, const int incx, + const T* beta, T* y, const int incy, const int offsetx, const int offsety, const int offseta); + + template + static void trsv_v2( cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, + const T *A, int lda, T *x, int incx, int offseta); + + template + static void ger(int m, int n, const T* alpha, + const T* x, int incx, + const T* y, int incy, + T* A, int lda); + + template + static T nrm2(int n, const T* x, int incx); + template + static void nrm2(int n, const T* x, int incx, T* result); + + template + static void scal(int n, T alpha, T* x, int incx); + template + static void scal(int n, T* alpha, T* x, int incx); + + template + static void gemm(bool transa, bool transb, int m, int n, int k, + const T * alpha, const T * A, int lda, + const T * B, int ldb, + const T * beta, T * C, int ldc); + + template + static void geam(bool transa, bool transb, int m, int n, + const T * alpha, const T * A, int lda, + const T * beta, const T * B, int ldb, + T * C, int ldc); + +}; + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/nvgraph_cusparse.hxx b/cpp/nvgraph/cpp/include/nvgraph_cusparse.hxx new file mode 100644 index 00000000000..09e8db487f5 --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_cusparse.hxx @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
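The Cublas wrapper above lazily creates one process-wide cublasHandle_t on first use and lets callers bind it to a stream. A minimal standalone sketch of that pattern around a real cuBLAS call (cublasSaxpy) is shown below; the helper name is invented here and error checks are omitted.

```cpp
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Lazily created, process-wide cuBLAS handle, mirroring the wrapper above.
static cublasHandle_t get_cublas_handle() {
    static cublasHandle_t h = nullptr;
    if (h == nullptr) cublasCreate(&h);
    return h;
}

int main() {
    const int n = 4;
    std::vector<float> hx = {1, 2, 3, 4}, hy = {10, 20, 30, 40};
    float *dx = nullptr, *dy = nullptr;
    cudaMalloc(&dx, n * sizeof(float));
    cudaMalloc(&dy, n * sizeof(float));
    cudaMemcpy(dx, hx.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    const float alpha = 2.0f;                       // y = alpha*x + y
    cublasSaxpy(get_cublas_handle(), n, &alpha, dx, 1, dy, 1);

    cudaMemcpy(hy.data(), dy, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (float v : hy) std::printf("%g ", v);       // 12 24 36 48
    std::printf("\n");

    cudaFree(dx);
    cudaFree(dy);
    cublasDestroy(get_cublas_handle());             // tear down the shared handle
    return 0;
}
```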
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include "valued_csr_graph.hxx" +#include "nvgraph_vector.hxx" + +#include +#include "debug_macros.h" + +namespace nvgraph +{ +class Cusparse +{ +private: + // global CUSPARSE handle for nvgraph + static cusparseHandle_t m_handle; // Constructor. + Cusparse(); + // Destructor. + ~Cusparse(); + +public: + + // Get the handle. + static cusparseHandle_t get_handle() + { + if (m_handle == 0) + CHECK_CUSPARSE(cusparseCreate(&m_handle)); + return m_handle; + } + // Destroy handle + static void destroy_handle() + { + if (m_handle != 0) + CHECK_CUSPARSE( cusparseDestroy(m_handle) ); + m_handle = 0; + } + static void setStream(cudaStream_t stream) + { + cusparseHandle_t handle = Cusparse::get_handle(); + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + } + // Set pointer mode + static void set_pointer_mode_device(); + static void set_pointer_mode_host(); + + // operate on all rows and columns y= alpha*A.x + beta*y + template + static void csrmv( const bool transposed, + const bool sym, + const int m, const int n, const int nnz, + const ValueType_* alpha, + const ValueType_* csrVal, + const IndexType_ *csrRowPtr, + const IndexType_ *csrColInd, + const ValueType_* x, + const ValueType_* beta, + ValueType_* y); + + template + static void csrmv( const bool transposed, + const bool sym, + const ValueType_* alpha, + const ValuedCsrGraph& G, + const Vector& x, + const ValueType_* beta, + Vector& y + ); + + // future possible features + /* + template + static void csrmv_with_mask( const typename TConfig::MatPrec alphaConst, + Matrix &A, + Vector &x, + const typename TConfig::MatPrec betaConst, + Vector &y ); + + template + static void csrmv_with_mask_restriction( const typename TConfig::MatPrec alphaConst, + Matrix &A, + Vector &x, + const typename TConfig::MatPrec betaConst, + Vector &y, + Matrix &P); + + // E is a vector that represents a diagonal matrix + // operate on all rows and columns + // y= alpha*E.x + beta*y + template + static void csrmv( const typename TConfig::MatPrec alphaConst, + Matrix &A, + const typename Matrix::MVector &E, + Vector &x, + const typename TConfig::MatPrec betaConst, + Vector &y, + ViewType view = OWNED ); + + // operate only on columns specified by columnColorSelector, see enum ColumnColorSelector above + // operate only on rows of specified color, given by A.offsets_rows_per_color, A.sorted_rows_by_color + // y= alpha*A.x + beta*y + template + static void csrmv( ColumnColorSelector columnColorSelector, + const int color, + const typename TConfig::MatPrec alphaConst, + Matrix &A, + Vector &x, + const typename TConfig::MatPrec betaConst, + Vector &y, + ViewType view = OWNED ); + + // E is a vector that represents a diagonal matrix + // operate only on rows of specified color, given by A.offsets_rows_per_color, A.sorted_rows_by_color + // y= alpha*E.x + beta*y + template + static void csrmv( const int color, + typename TConfig::MatPrec alphaConst, + Matrix &A, + const typename Matrix::MVector &E, + Vector &x, + typename TConfig::MatPrec betaConst, + Vector &y, + ViewType view=OWNED ); + + template + static void csrmm(typename TConfig::MatPrec alpha, + Matrix &A, + Vector &V, + typename TConfig::VecPrec beta, + Vector &Res); + +*/ + + template + static void csrmm(const bool transposed, + const bool sym, + const int m, + const int n, + const int k, + const int nnz, + const ValueType_* alpha, + const ValueType_* csrVal, + 
const IndexType_* csrRowPtr, + const IndexType_* csrColInd, + const ValueType_* x, + const int ldx, + const ValueType_* beta, + ValueType_* y, + const int ldy); + + //template + static void csr2coo( const int n, + const int nnz, + const int *csrRowPtr, + int *cooRowInd); +}; + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/nvgraph_error.hxx b/cpp/nvgraph/cpp/include/nvgraph_error.hxx new file mode 100644 index 00000000000..14815c83acd --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_error.hxx @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +//#define VERBOSE_DIAG +//#define DEBUG 1 + +namespace nvgraph { + +typedef void (*NVGRAPH_output_callback)(const char *msg, int length); +extern NVGRAPH_output_callback nvgraph_output; +extern NVGRAPH_output_callback error_output; +extern NVGRAPH_output_callback nvgraph_distributed_output; +int nvgraph_printf(const char* fmt, ...); + +#if defined(DEBUG) || defined(VERBOSE_DIAG) +#define nvgraph_printf_debug(fmt,...) nvgraph_printf(fmt,##__VA_ARGS__) +#define device_printf(fmt,...) printf(fmt,##__VA_ARGS__) +#else +#define nvgraph_printf_debug(fmt,...) +#define device_printf(fmt,...) 
+#endif + +// print stacktrace only in debug mode +#if defined(DEBUG) || defined(VERBOSE_DIAG) +#define STACKTRACE "\nStack trace:\n" + std::string(e.trace()) +#define WHERE " at: " << __FILE__ << ':' << __LINE__ +#else +#define STACKTRACE "" +#define WHERE "" +#endif + + +enum NVGRAPH_ERROR { +/********************************************************* + * Flags for status reporting + *********************************************************/ + NVGRAPH_OK=0, + NVGRAPH_ERR_BAD_PARAMETERS=1, + NVGRAPH_ERR_UNKNOWN=2, + NVGRAPH_ERR_CUDA_FAILURE=3, + NVGRAPH_ERR_THRUST_FAILURE=4, + NVGRAPH_ERR_IO=5, + NVGRAPH_ERR_NOT_IMPLEMENTED=6, + NVGRAPH_ERR_NO_MEMORY=7, + NVGRAPH_ERR_NOT_CONVERGED=8 +}; + +// define our own bad_alloc so we can set its .what() +class nvgraph_exception: public std::exception +{ + public: + inline nvgraph_exception(const std::string &w, const std::string &where, const std::string &trace, NVGRAPH_ERROR reason) : m_trace(trace), m_what(w), m_reason(reason), m_where(where) + { + } + + inline virtual ~nvgraph_exception(void) throw () {}; + + inline virtual const char *what(void) const throw() + { + return m_what.c_str(); + } + inline virtual const char *where(void) const throw() + { + return m_where.c_str(); + } + inline virtual const char *trace(void) const throw() + { + return m_trace.c_str(); + } + inline virtual NVGRAPH_ERROR reason(void) const throw() + { + return m_reason; + } + + + private: + std::string m_trace; + std::string m_what; + NVGRAPH_ERROR m_reason; + std::string m_where; +}; // end bad_alloc + + +int NVGRAPH_GetErrorString( NVGRAPH_ERROR error, char* buffer, int buf_len); + +/******************************************************** + * Prints the error message, the stack trace, and exits + * ******************************************************/ +#define FatalError(s, reason) { \ + std::stringstream _where; \ + _where << WHERE ; \ + std::stringstream _trace; \ + printStackTrace(_trace); \ + throw nvgraph_exception(std::string(s) + "\n", _where.str(), _trace.str(), reason); \ +} + +#undef cudaCheckError +#if defined(DEBUG) || defined(VERBOSE_DIAG) +#define cudaCheckError() { \ + cudaError_t e=cudaGetLastError(); \ + if(e!=cudaSuccess) { \ + std::stringstream _error; \ + _error << "Cuda failure: '" << cudaGetErrorString(e) << "'"; \ + FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ + } \ +} +#else // NO DEBUG +#define cudaCheckError() \ + { \ + cudaError_t __e = cudaGetLastError(); \ + if (__e != cudaSuccess) { \ + FatalError("", NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } +#endif + +#define CHECK_CUDA(call) \ + { \ + cudaError_t _e = (call); \ + if (_e != cudaSuccess) \ + { \ + std::stringstream _error; \ + _error << "CUDA Runtime failure: '#" << _e << "'"; \ + FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } + +#define CHECK_CURAND(call) \ + { \ + curandStatus_t _e = (call); \ + if (_e != CURAND_STATUS_SUCCESS) \ + { \ + std::stringstream _error; \ + _error << "CURAND failure: '#" << _e << "'"; \ + FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } + +#define CHECK_CUBLAS(call) \ + { \ + cublasStatus_t _e = (call); \ + if (_e != CUBLAS_STATUS_SUCCESS) \ + { \ + std::stringstream _error; \ + _error << "CUBLAS failure: '#" << _e << "'"; \ + FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } + +#define CHECK_CUSPARSE(call) \ + { \ + cusparseStatus_t _e = (call); \ + if (_e != CUSPARSE_STATUS_SUCCESS) \ + { \ + std::stringstream _error; \ + _error << "CURAND failure: '#" << _e << "'"; \ + FatalError(_error.str(), 
NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } + +#define CHECK_CUSOLVER(call) \ + { \ + cusolverStatus_t _e = (call); \ + if (_e != CUSOLVER_STATUS_SUCCESS) \ + { \ + std::stringstream _error; \ + _error << "CURAND failure: '#" << _e << "'"; \ + FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } + +#define NVGRAPH_CATCHES(rc) catch (nvgraph_exception e) { \ + std::string err = "Caught nvgraph exception: " + std::string(e.what()) \ + + std::string(e.where()) + STACKTRACE + "\n"; \ + error_output(err.c_str(), static_cast(err.length())); \ + rc = e.reason(); \ + } catch (std::bad_alloc e) { \ + std::string err = "Not enough memory: " + std::string(e.what()) \ + + "\nFile and line number are not available for this exception.\n"; \ + error_output(err.c_str(), static_cast(err.length())); \ + rc = NVGRAPH_ERR_NO_MEMORY; \ + } catch (std::exception e) { \ + std::string err = "Caught unknown exception: " + std::string(e.what()) \ + + "\nFile and line number are not available for this exception.\n"; \ + error_output(err.c_str(), static_cast(err.length())); \ + rc = NVGRAPH_ERR_UNKNOWN; \ + } catch (...) { \ + std::string err = \ + "Caught unknown exception\nFile and line number are not available for this exception.\n"; \ + error_output(err.c_str(), static_cast(err.length())); \ + rc = NVGRAPH_ERR_UNKNOWN; \ + } + +// Since there is no global-level thrust dependency, we don't include this globally. May add later + /* + catch (thrust::system_error &e) { \ + std::string err = "Thrust failure: " + std::string(e.what()) \ + + "\nFile and line number are not available for this exception.\n"; \ + error_output(err.c_str(), static_cast(err.length())); \ + rc = NVGRAPH_ERR_THRUST_FAILURE; \ + } catch (thrust::system::detail::bad_alloc e) { \ + std::string err = "Thrust failure: " + std::string(e.what()) \ + + "\nFile and line number are not available for this exception.\n"; \ + error_output(err.c_str(), static_cast(err.length())); \ + rc = NVGRAPH_ERR_NO_MEMORY; \ + } + */ + + + + // simple cuda timer + // can be called in cpp files + class cuda_timer { + public: + cuda_timer(); + void start(); + float stop(); // in ms + private: + struct event_pair; + event_pair* p; + }; + +} // namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/nvgraph_experimental.h b/cpp/nvgraph/cpp/include/nvgraph_experimental.h new file mode 100644 index 00000000000..0ce29cc7fad --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_experimental.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
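The macros above funnel CUDA and library failures into a reason-carrying exception, and NVGRAPH_CATCHES converts whatever escapes into an NVGRAPH_ERROR at the C API boundary. The standalone sketch below shows only the shape of that scheme, with no stack trace and with hypothetical names; it is not the library's code.

```cpp
#include <cstdio>
#include <exception>
#include <new>
#include <string>

enum status_t { STATUS_OK = 0, STATUS_BAD_PARAMETERS = 1,
                STATUS_UNKNOWN = 2, STATUS_NO_MEMORY = 7 };

// Reason-carrying exception, a stripped-down stand-in for nvgraph_exception.
struct reason_exception : std::exception {
  std::string msg;
  status_t    reason;
  reason_exception(std::string m, status_t r) : msg(std::move(m)), reason(r) {}
  const char* what() const noexcept override { return msg.c_str(); }
};

// An API boundary: internal code throws, the entry point maps to a status code.
status_t api_entry_point(int n) {
  status_t rc = STATUS_OK;
  try {
    if (n < 0)
      throw reason_exception("n must be non-negative", STATUS_BAD_PARAMETERS);
    // ... real work would run here ...
  } catch (const reason_exception& e) {   // library errors keep their reason
    std::fprintf(stderr, "%s\n", e.what());
    rc = e.reason;
  } catch (const std::bad_alloc&) {       // allocation failures
    rc = STATUS_NO_MEMORY;
  } catch (...) {                         // anything else
    rc = STATUS_UNKNOWN;
  }
  return rc;
}

int main() {
  std::printf("%d\n", static_cast<int>(api_entry_point(-1)));  // prints 1
  std::printf("%d\n", static_cast<int>(api_entry_point(3)));   // prints 0
  return 0;
}
```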
+ */ + +// Internal header of NVGRAPH library +// +// +// WARNING: +// This header give access to experimental feature and internal routines that are not in the official API +// +// +#include "nvgraph.h" + + +#ifdef __cplusplus +#include "cstdio" +#else +#include "stdio.h" +#endif + +#ifndef NVGRAPH_API +#ifdef _WIN32 +#define NVGRAPH_API __stdcall +#else +#define NVGRAPH_API +#endif +#endif + +#ifdef __cplusplus + extern "C" { +#endif + +/* Edge matching types */ +typedef enum +{ + NVGRAPH_UNSCALED = 0, // using edge values as is + NVGRAPH_SCALED_BY_ROW_SUM = 1, // 0.5*(A_ij+A_ji)/max(d(i),d (j)), where d(i) is the sum of the row i + NVGRAPH_SCALED_BY_DIAGONAL = 2, // 0.5*(A_ij+A_ji)/max(diag(i),diag(j)) +} nvgraphEdgeWeightMatching_t; + + +nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects); + +nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const int clusters, + const int* clustering, + float * modularity); + +nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const nvgraphEdgeWeightMatching_t similarity_metric, + int* aggregates, + size_t* n_aggregates); + +nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const int evs_type, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects); + +nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * edgeCut, + float * ratioCut); + +nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const void *alpha, + const size_t bookmark_index, + const float tolerance, + const int max_iter, + const int subspace_size, + const int has_guess, + const size_t pagerank_index); + +#if defined(__cplusplus) +} //extern "C" +#endif + diff --git a/cpp/nvgraph/cpp/include/nvgraph_lapack.hxx b/cpp/nvgraph/cpp/include/nvgraph_lapack.hxx new file mode 100644 index 00000000000..a230db57258 --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_lapack.hxx @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +namespace nvgraph +{ +template class Lapack; + +template +class Lapack +{ +private: + Lapack(); + ~Lapack(); +public: + static void check_lapack_enabled(); + + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, const T * A, int lda, const T * B, int ldb, T beta, T * C, int ldc); + + // special QR for lanczos + static void sterf(int n, T * d, T * e); + static void steqr(char compz, int n, T * d, T * e, T * z, int ldz, T * work); + + // QR + // computes the QR factorization of a general matrix + static void geqrf (int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. + //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); + // multiply C by implicit Q + static void ormqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void qrf (int n, T *H, T *Q, T *R); + + //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); + static void geev(T* A, T* eigenvalues, int dim, int lda); + static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); + static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr); + +}; +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/nvgraph_vector.hxx b/cpp/nvgraph/cpp/include/nvgraph_vector.hxx new file mode 100644 index 00000000000..33a69e9c1a1 --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_vector.hxx @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include "nvgraph_error.hxx" +#include "nvgraph_vector_kernels.hxx" + +#include "debug_macros.h" + +namespace nvgraph +{ + +/*! A Vector contains a device vector of size |E| and type T + */ +template +class Vector +{ +public: + //typedef IndexType_ IndexType; + typedef ValueType_ ValueType; + +protected: + /*! Storage for the values. + */ + SHARED_PREFIX::shared_ptr values; + + /*! Size of the array + */ + size_t size; + + /*! Storage for a cuda stream + */ + //, cudaStream_t stream = 0 + +public: + + /*! Construct an empty \p Vector. + */ + Vector(void) {} + ~Vector(void) {} + /*! Construct a \p Vector of size vertices. 
+ * + * \param vertices The size of the Vector + */ + Vector(size_t vertices, cudaStream_t stream = 0) + : values(allocateDevice(vertices, stream)), + size(vertices) {} + + + size_t get_size() const { return size; } + size_t bytes() const { return size*sizeof(ValueType);} + ValueType* raw() const { return values.get(); } + //cudaStream_t get_stream() const { return stream_; } + void allocate(size_t n, cudaStream_t stream = 0) + { + size = n; + values = allocateDevice(n, stream); + } + + void attach(size_t n, ValueType* vals, cudaStream_t stream = 0) + { + size = n; + values = attachDevicePtr(vals, stream); + } + + Vector(size_t vertices, ValueType * vals, cudaStream_t stream = 0) + : values(attachDevicePtr(vals, stream)), + size(vertices) {} + + void fill(ValueType val, cudaStream_t stream = 0) + { + fill_raw_vec(this->raw(), this->get_size(), val, stream); + } + void copy(Vector &vec1, cudaStream_t stream = 0) + { + if (this->get_size() == 0 && vec1.get_size()>0) + { + allocate(vec1.get_size(), stream); + copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); + } + else if (this->get_size() == vec1.get_size()) + copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); + else if (this->get_size() > vec1.get_size()) + { + //COUT() << "Warning Copy : sizes mismatch "<< this->get_size() <<':'<< vec1.get_size() <raw(), stream); + //dump_raw_vec (this->raw(), vec1.get_size(), 0); + } + else + { + FatalError("Cannot copy a vector into a smaller one", NVGRAPH_ERR_BAD_PARAMETERS); + } + } + void dump(size_t off, size_t sz, cudaStream_t stream = 0) + { + if ((off+sz)<= this->size) + dump_raw_vec(this->raw(), sz, off, stream); + else + FatalError("Offset and Size values doesn't make sense", NVGRAPH_ERR_BAD_PARAMETERS); + } + void flag_zeros(Vector & flags, cudaStream_t stream = 0) + { + flag_zeros_raw_vec(this->get_size(), this->raw(), flags.raw(), stream); + } + + ValueType nrm1(cudaStream_t stream = 0) + { + ValueType res = 0; + nrm1_raw_vec(this->raw(), this->get_size(), &res, stream); + return res; + } +}; // class Vector +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/nvgraph_vector_kernels.hxx b/cpp/nvgraph/cpp/include/nvgraph_vector_kernels.hxx new file mode 100644 index 00000000000..9a0e640044a --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvgraph_vector_kernels.hxx @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
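Vector above owns its device buffer through a shared_ptr returned by allocateDevice/attachDevicePtr (declared elsewhere in the library), and fill/copy dispatch to the raw-vector kernels declared in this header. The sketch below shows one plausible way those pieces fit together, a cudaFree-ing deleter plus a trivial fill kernel; it is an assumption-laden illustration, not the library code.

```cpp
#include <cstdio>
#include <memory>
#include <cuda_runtime.h>

// Device buffer owned by a shared_ptr whose deleter calls cudaFree,
// one plausible shape for what allocateDevice plus Vector provide.
template <typename T>
std::shared_ptr<T> allocate_device(size_t n) {
  T* p = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&p), n * sizeof(T));
  return std::shared_ptr<T>(p, [](T* q) { cudaFree(q); });
}

// Trivial kernel with the same intent as fill_raw_vec: v[i] = value.
template <typename T>
__global__ void fill_kernel(T* v, size_t n, T value) {
  size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < n) v[i] = value;
}

int main() {
  const size_t n = 8;
  auto vec = allocate_device<float>(n);
  fill_kernel<<<static_cast<unsigned>((n + 255) / 256), 256>>>(vec.get(), n, 3.5f);

  float host[8];
  cudaMemcpy(host, vec.get(), n * sizeof(float), cudaMemcpyDeviceToHost);
  for (size_t i = 0; i < n; ++i) std::printf("%g ", host[i]);
  std::printf("\n");
  return 0;  // the shared_ptr deleter releases the device buffer
}
```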
+ */ + +#pragma once +namespace nvgraph +{ + template + void nrm1_raw_vec (ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream = 0); + + template + void fill_raw_vec (ValueType_* vec, size_t n, ValueType_ value, cudaStream_t stream = 0); + + template + void dump_raw_vec (ValueType_* vec, size_t n, int offset, cudaStream_t stream = 0); + + template + void dmv (size_t num_vertices, ValueType_ alpha, ValueType_* D, ValueType_* x, ValueType_ beta, ValueType_* y, cudaStream_t stream = 0); + + template + void copy_vec(ValueType_ *vec1, size_t n, ValueType_ *res, cudaStream_t stream = 0); + + template + void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flag, cudaStream_t stream = 0 ); + + template + void set_connectivity( size_t n, IndexType_ root, ValueType_ self_loop_val, ValueType_ unreachable_val, ValueType_* res, cudaStream_t stream = 0); + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/nvlouvain.cuh b/cpp/nvgraph/cpp/include/nvlouvain.cuh new file mode 100644 index 00000000000..9644a17d40d --- /dev/null +++ b/cpp/nvgraph/cpp/include/nvlouvain.cuh @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "graph_utils.cuh" +#include "modularity.cuh" +#include "delta_modularity.cuh" +#include "high_res_clock.h" +#include "size2_selector.cuh" +#include "thrust_coarse_generator.cuh" + +namespace nvlouvain{ + +//#define VERBOSE true + +#define LOG() (log< +NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, + const size_t num_vertex, const size_t num_edges, + bool& weighted, bool has_init_cluster, + IdxType* init_cluster, // size = n_vertex + ValType& final_modularity, + IdxType* cluster_vec, // size = n_vertex + IdxType& num_level, + std::ostream& log = std::cout){ +#ifndef ENABLE_LOG + log.setstate(std::ios_base::failbit); +#endif + num_level = 0; + cusparseHandle_t cusp_handle; + cusparseCreate(&cusp_handle); + + int n_edges = num_edges; + int n_vertex = num_vertex; + + thrust::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); + thrust::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); + thrust::device_vector csr_val_d(csr_val, csr_val + n_edges); + + //std::vector clustering(n_vertex); + thrust::device_vector clustering(n_vertex); + int upper_bound = 100; + + HighResClock hr_clock; + double timed, diff_time; + //size_t mem_tot= 0; + //size_t mem_free = 0; + + int c_size(n_vertex); + unsigned int best_c_size = (unsigned) n_vertex; + unsigned current_n_vertex(n_vertex); + int num_aggregates(n_edges); + ValType m2 = thrust::reduce(thrust::cuda::par, csr_val_d.begin(), csr_val_d.begin() + n_edges); + + ValType best_modularity = -1; + + thrust::device_vector new_csr_ptr(n_vertex, 0); + thrust::device_vector new_csr_ind(n_edges, 0); + thrust::device_vector new_csr_val(n_edges, 0); + + 
thrust::device_vector cluster_d(n_vertex); + thrust::device_vector aggregates_tmp_d(n_vertex, 0); + thrust::device_vector cluster_inv_ptr(c_size + 1, 0); + thrust::device_vector cluster_inv_ind(n_vertex, 0); + thrust::device_vector k_vec(n_vertex, 0); + thrust::device_vector Q_arr(n_vertex, 0); + thrust::device_vector delta_Q_arr(n_edges, 0); + thrust::device_vector cluster_sum_vec(c_size, 0); + thrust::host_vector best_cluster_h(n_vertex, 0); + Vector aggregates((int) current_n_vertex, 0); + + IdxType* cluster_inv_ptr_ptr = thrust::raw_pointer_cast(cluster_inv_ptr.data()); + IdxType* cluster_inv_ind_ptr = thrust::raw_pointer_cast(cluster_inv_ind.data()); + IdxType* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + IdxType* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); + ValType* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + IdxType* cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); + + if(!has_init_cluster){ + // if there is no initialized cluster + // the cluster as assigned as a sequence (a cluster for each vertex) + // inv_clusters will also be 2 sequence + thrust::sequence(thrust::cuda::par, cluster_d.begin(), cluster_d.end()); + thrust::sequence(thrust::cuda::par, cluster_inv_ptr.begin(), cluster_inv_ptr.end()); + thrust::sequence(thrust::cuda::par, cluster_inv_ind.begin(), cluster_inv_ind.end()); + } + else{ + // assign initialized cluster to cluster_d device vector + // generate inverse cluster in CSR formate + if(init_cluster == nullptr){ + final_modularity = -1; + return NVLOUVAIN_ERR_BAD_PARAMETERS; + } + + thrust::copy(init_cluster, init_cluster + n_vertex , cluster_d.begin()); + generate_cluster_inv(current_n_vertex, c_size, cluster_d.begin(), cluster_inv_ptr, cluster_inv_ind); + } + + dim3 block_size_1d((n_vertex + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size_1d(BLOCK_SIZE_1D, 1, 1); + dim3 block_size_2d((n_vertex + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, (n_vertex + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, 1); + dim3 grid_size_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1); + + ValType* k_vec_ptr = thrust::raw_pointer_cast(k_vec.data()); + ValType* Q_arr_ptr = thrust::raw_pointer_cast(Q_arr.data()); + ValType* cluster_sum_vec_ptr = thrust::raw_pointer_cast(cluster_sum_vec.data()); + ValType* delta_Q_arr_ptr = thrust::raw_pointer_cast(delta_Q_arr.data()); + + ValType new_Q, cur_Q, delta_Q, delta_Q_final; + unsigned old_c_size(c_size); + bool updated = true; + + hr_clock.start(); + // Get the initialized modularity + new_Q = modularity( n_vertex, n_edges, c_size, m2, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + weighted, k_vec_ptr, Q_arr_ptr, delta_Q_arr_ptr); // delta_Q_arr_ptr is temp_i + + + hr_clock.stop(&timed); + diff_time = timed; + + LOG()<<"Initial modularity value: "< size2_sector(config, 0, 50, 0.6, true, false, 0); + int agg_deterministic = 1; + int agg_max_iterations = 25; + ValType agg_numUnassigned_tol = 0.85; + bool agg_two_phase = false; + bool agg_merge_singletons = true; + + + if (current_n_vertex<8) + { + agg_merge_singletons = false; + //agg_max_iterations = 4; + } + + + Size2Selector size2_sector(config, agg_deterministic, agg_max_iterations, agg_numUnassigned_tol, agg_two_phase, agg_merge_singletons, 0); + + //hollywood-2009 0.5 + + +#ifdef DEBUG + if((unsigned)cluster_d.size()!= current_n_vertex) + //LOG()<<"Error cluster_d.size()!= current_n_verte:qx"<< cluster_d.size() <<" != "<< current_n_vertex <<"\n"; +#endif + +#ifdef VERBOSE + //LOG()<<"n_vertex: "<< 
csr_ptr_d.size()<<" "< "< 0.0001){ + + printf("Warning new_Q != best_Q %f != %f \n", new_Q, best_modularity); +#if 0 + printf("best_c_size = %d\n", best_c_size); + + std::ofstream ouf("./log/Error_"+time_now()+".log"); + display_vec(aggregates_tmp_d, ouf); + ouf<<"Error new_Q != best_Q "<< new_Q<<" != "<< best_modularity<<"\n"; + ouf<<"old graph with size = "< 0.0001 || except >0) && (bound < upper_bound)); + + LOG()<<"======================= modularity: "< +NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, + const size_t num_vertex, const size_t num_edges, + bool& weighted, bool has_init_cluster, + IdxType* init_cluster, // size = n_vertex + ValType& final_modularity, + std::vector< std::vector >& cluster_vec, +// std::vector< IdxType* >& cluster_vec, + IdxType& num_level, + std::ostream& log = std::cout){ +#ifndef ENABLE_LOG + log.setstate(std::ios_base::failbit); +#endif + num_level = 0; + cusparseHandle_t cusp_handle; + cusparseCreate(&cusp_handle); + + int n_edges = num_edges; + int n_vertex = num_vertex; + + thrust::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); + thrust::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); + thrust::device_vector csr_val_d(csr_val, csr_val + n_edges); + + + int upper_bound = 100; + + HighResClock hr_clock; + double timed, diff_time; + + int c_size(n_vertex); + unsigned int best_c_size = (unsigned) n_vertex; + int current_n_vertex(n_vertex); + int num_aggregates(n_edges); + ValType m2 = thrust::reduce(thrust::cuda::par, csr_val_d.begin(), csr_val_d.begin() + n_edges); + + ValType best_modularity = -1; + + thrust::device_vector new_csr_ptr(n_vertex, 0); + thrust::device_vector new_csr_ind(n_edges, 0); + thrust::device_vector new_csr_val(n_edges, 0); + + thrust::device_vector cluster_d(n_vertex); + thrust::device_vector aggregates_tmp_d(n_vertex, 0); + thrust::device_vector cluster_inv_ptr(c_size + 1, 0); + thrust::device_vector cluster_inv_ind(n_vertex, 0); + thrust::device_vector k_vec(n_vertex, 0); + thrust::device_vector Q_arr(n_vertex, 0); + thrust::device_vector delta_Q_arr(n_edges, 0); + thrust::device_vector cluster_sum_vec(c_size, 0); + std::vector best_cluster_h(n_vertex, 0); + Vector aggregates(current_n_vertex, 0); + + IdxType* cluster_inv_ptr_ptr = thrust::raw_pointer_cast(cluster_inv_ptr.data()); + IdxType* cluster_inv_ind_ptr = thrust::raw_pointer_cast(cluster_inv_ind.data()); + IdxType* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + IdxType* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); + ValType* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + IdxType* cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); + + + + + if(!has_init_cluster){ + // if there is no initialized cluster + // the cluster as assigned as a sequence (a cluster for each vertex) + // inv_clusters will also be 2 sequence + thrust::sequence(thrust::cuda::par, cluster_d.begin(), cluster_d.end()); + thrust::sequence(thrust::cuda::par, cluster_inv_ptr.begin(), cluster_inv_ptr.end()); + thrust::sequence(thrust::cuda::par, cluster_inv_ind.begin(), cluster_inv_ind.end()); + } + else{ + // assign initialized cluster to cluster_d device vector + // generate inverse cluster in CSR formate + if(init_cluster == nullptr){ + final_modularity = -1; + return NVLOUVAIN_ERR_BAD_PARAMETERS; + } + + thrust::copy(init_cluster, init_cluster + n_vertex , cluster_d.begin()); + generate_cluster_inv(current_n_vertex, c_size, cluster_d.begin(), cluster_inv_ptr, cluster_inv_ind); + } + + dim3 block_size_1d((n_vertex 
+ BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size_1d(BLOCK_SIZE_1D, 1, 1); + dim3 block_size_2d((n_vertex + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, (n_vertex + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, 1); + dim3 grid_size_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1); + + ValType* k_vec_ptr = thrust::raw_pointer_cast(k_vec.data()); + ValType* Q_arr_ptr = thrust::raw_pointer_cast(Q_arr.data()); + ValType* cluster_sum_vec_ptr = thrust::raw_pointer_cast(cluster_sum_vec.data()); + ValType* delta_Q_arr_ptr = thrust::raw_pointer_cast(delta_Q_arr.data()); + + ValType new_Q, cur_Q, delta_Q, delta_Q_final; + unsigned old_c_size(c_size); + bool updated = true; + + hr_clock.start(); + // Get the initialized modularity + new_Q = modularity( n_vertex, n_edges, c_size, m2, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + weighted, k_vec_ptr, Q_arr_ptr, delta_Q_arr_ptr); // delta_Q_arr_ptr is temp_i + + hr_clock.stop(&timed); + diff_time = timed; + + LOG()<<"Initial modularity value: "< size2_sector(config, 0, 50, 0.6, true, false, 0); + Size2Selector size2_sector(config, 1, 25, 0.85, false, true, 0); + //hollywood-2009 0.5 + + +#ifdef DEBUG + if((unsigned)cluster_d.size()!= current_n_vertex) + //LOG()<<"Error cluster_d.size()!= current_n_verte:qx"<< cluster_d.size() <<" != "<< current_n_vertex <<"\n"; +#endif + +#ifdef VERBOSE + //LOG()<<"n_vertex: "<< csr_ptr_d.size()<<" "< "< 0.0001){ + + printf("Warning new_Q != best_Q %f != %f \n", new_Q, best_modularity); +#if 0 + printf("best_c_size = %d\n", best_c_size); + + std::ofstream ouf("./log/Error_"+time_now()+".log"); + display_vec(aggregates_tmp_d, ouf); + ouf<<"Error new_Q != best_Q "<< new_Q<<" != "<< best_modularity<<"\n"; + ouf<<"old graph with size = "< 0.0001 || except >0) && (bound < upper_bound)); + + LOG()<<"======================= modularity: "< +class Pagerank +{ +public: + typedef IndexType_ IndexType; + typedef ValueType_ ValueType; + +private: + ValuedCsrGraph m_network ; + Vector m_a; + Vector m_b; + Vector m_pagerank; + Vector m_tmp; + ValueType m_damping_factor; + ValueType m_residual; + ValueType m_tolerance; + cudaStream_t m_stream; + int m_iterations; + int m_max_it; + bool m_is_setup; + bool m_has_guess; + + bool solve_it(); + //void update_dangling_nodes(Vector& dangling_nodes); + void setup(ValueType damping_factor, Vector& initial_guess, Vector& pagerank_vector); + +public: + // Simple constructor + Pagerank(void) {}; + // Simple destructor + ~Pagerank(void) {}; + + // Create a Pagerank Solver attached to a the transposed of a transition matrix + // *** network is the transposed of a transition matrix*** + Pagerank(const ValuedCsrGraph & network, Vector& dangling_nodes, cudaStream_t stream = 0); + + // dangling_nodes is a vector of size n where dangling_nodes[i] = 1.0 if vertex i is a dangling node and 0.0 otherwise + // pagerank_vector is the output + //void solve(ValueType damping_factor, Vector& dangling_nodes, Vector& pagerank_vector); + // setup with an initial guess of the pagerank + NVGRAPH_ERROR solve(ValueType damping_factor, Vector& initial_guess, Vector& pagerank_vector, float tolerance =1.0E-6, int max_it = 500); + inline ValueType get_residual() const {return m_residual;} + inline int get_iterations() const {return m_iterations;} + + +// init : +// We need the transpose (=converse =reverse) in input (this can be seen as a CSC matrix that we see as CSR) +// b is a constant and uniform vector, b = 1.0/num_vertices +// a is a constant vector that initialy store the dangling nodes 
then we set : a = alpha*a + (1-alpha)e +// pagerank is 0 +// tmp is random ( 1/n is fine) +// alpha is a constant scalar (0.85 usually) + +//loop : +// pagerank = csrmv (network, tmp) +// scal(pagerank, alpha); //pagerank = alpha*pagerank +// gamma = dot(a, tmp); //gamma = a*tmp +// pagerank = axpy(b, pagerank, gamma); // pagerank = pagerank+gamma*b + +// convergence check +// tmp = axpby(pagerank, tmp, -1, 1); // tmp = pagerank - tmp +// residual_norm = norm(tmp); +// if converged (residual_norm) + // l1 = l1_norm(pagerank); + // pagerank = scal(pagerank, 1/l1); + // return pagerank +// swap(tmp, pagerank) +//end loop +}; + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/pagerank_kernels.hxx b/cpp/nvgraph/cpp/include/pagerank_kernels.hxx new file mode 100644 index 00000000000..0391883a63a --- /dev/null +++ b/cpp/nvgraph/cpp/include/pagerank_kernels.hxx @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +namespace nvgraph +{ + template + void update_dangling_nodes(int n, ValueType_* dangling_nodes, ValueType_ damping_factor, cudaStream_t stream = 0); + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/partition.hxx b/cpp/nvgraph/cpp/include/partition.hxx new file mode 100644 index 00000000000..29dd928a34c --- /dev/null +++ b/cpp/nvgraph/cpp/include/partition.hxx @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "nvgraph_error.hxx" +#include "valued_csr_graph.hxx" +#include "matrix.hxx" + + +namespace nvgraph { + #define SPECTRAL_USE_COLORING true + + #define SPECTRAL_USE_LOBPCG true + #define SPECTRAL_USE_PRECONDITIONING true + #define SPECTRAL_USE_SCALING_OF_EIGVECS false + + #define SPECTRAL_USE_MAGMA false + #define SPECTRAL_USE_THROTTLE true + #define SPECTRAL_USE_NORMALIZED_LAPLACIAN true + #define SPECTRAL_USE_R_ORTHOGONALIZATION false + + /// Spectral graph partition + /** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. 
+ * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR partition( ValuedCsrGraph& G, + IndexType_ nParts, + IndexType_ nEigVecs, + IndexType_ maxIter_lanczos, + IndexType_ restartIter_lanczos, + ValueType_ tol_lanczos, + IndexType_ maxIter_kmeans, + ValueType_ tol_kmeans, + IndexType_ * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + IndexType_ & iters_lanczos, + IndexType_ & iters_kmeans); + + template + NVGRAPH_ERROR partition_lobpcg( ValuedCsrGraph& G, Matrix * M, cusolverDnHandle_t cusolverHandle, + IndexType_ nParts, + IndexType_ nEigVecs, + IndexType_ maxIter_lanczos, + ValueType_ tol_lanczos, + IndexType_ maxIter_kmeans, + ValueType_ tol_kmeans, + IndexType_ * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + IndexType_ & iters_lanczos, + IndexType_ & iters_kmeans); + + + /// Compute cost function for partition + /** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param parts (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR analyzePartition(ValuedCsrGraph & G, + IndexType_ nParts, + const IndexType_ * __restrict__ parts, + ValueType_ & edgeCut, ValueType_ & cost); + +} + diff --git a/cpp/nvgraph/cpp/include/range_view.hxx b/cpp/nvgraph/cpp/include/range_view.hxx new file mode 100644 index 00000000000..c3254e5eab4 --- /dev/null +++ b/cpp/nvgraph/cpp/include/range_view.hxx @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#ifndef RANGE_VIEW_HXX +#define RANGE_VIEW_HXX + +// This example demonstrates the use of a view: a non-owning wrapper for an +// iterator range which presents a container-like interface to the user. +// +// For example, a view of a device_vector's data can be helpful when we wish to +// access that data from a device function. Even though device_vectors are not +// accessible from device functions, the range_view class allows us to access +// and manipulate its data as if we were manipulating a real container. 
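+// Added illustrative sketch (not part of the original example): a range_view
+// can be passed to a CUDA kernel by value and indexed like a container. The
+// kernel name, sizes and launch configuration below are assumptions made for
+// illustration only.
+//
+//   __global__ void scale(range_view<float*> v, float a)
+//   {
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < v.size()) v[i] *= a;     // operator[] dereferences first + i
+//   }
+//
+//   thrust::device_vector<float> d(n, 1.0f);
+//   scale<<<(n + 255) / 256, 256>>>(make_range_view(d.data().get(), n), 2.0f);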
+// + +// This example demonstrate use of range_view with for_each algorithm which is +// dispatch from GPU +// + +template +class range_view +{ +public: + typedef Iterator iterator; + typedef typename thrust::iterator_traits::value_type value_type; + typedef typename thrust::iterator_traits::pointer pointer; + typedef typename thrust::iterator_traits::difference_type difference_type; + typedef typename thrust::iterator_traits::reference reference; + +private: + const iterator first; + const iterator last; + + +public: + __host__ __device__ + range_view(Iterator first, Iterator last) + : first(first), last(last) {} + __host__ __device__ + ~range_view() {} + + __host__ __device__ + difference_type size() const { return thrust::distance(first, last); } + + + __host__ __device__ + reference operator[](difference_type n) + { + return *(first + n); + } + __host__ __device__ + const reference operator[](difference_type n) const + { + return *(first + n); + } + + __host__ __device__ + iterator begin() + { + return first; + } + __host__ __device__ + const iterator cbegin() const + { + return first; + } + __host__ __device__ + iterator end() + { + return last; + } + __host__ __device__ + const iterator cend() const + { + return last; + } + + + __host__ __device__ + thrust::reverse_iterator rbegin() + { + return thrust::reverse_iterator(end()); + } + __host__ __device__ + const thrust::reverse_iterator crbegin() const + { + return thrust::reverse_iterator(cend()); + } + __host__ __device__ + thrust::reverse_iterator rend() + { + return thrust::reverse_iterator(begin()); + } + __host__ __device__ + const thrust::reverse_iterator crend() const + { + return thrust::reverse_iterator(cbegin()); + } + __host__ __device__ + reference front() + { + return *begin(); + } + __host__ __device__ + const reference front() const + { + return *cbegin(); + } + + __host__ __device__ + reference back() + { + return *end(); + } + __host__ __device__ + const reference back() const + { + return *cend(); + } + + __host__ __device__ + bool empty() const + { + return size() == 0; + } + +}; + +// This helper function creates a range_view from iterator and the number of +// elements +template +range_view +__host__ __device__ +make_range_view(Iterator first, Size n) +{ + return range_view(first, first+n); +} + +// This helper function creates a range_view from a pair of iterators +template +range_view +__host__ __device__ +make_range_view(Iterator first, Iterator last) +{ + return range_view(first, last); +} + +// This helper function creates a range_view from a Vector +template +range_view +__host__ +make_range_view(Vector& v) +{ + return range_view(v.begin(), v.end()); +} + +#endif diff --git a/cpp/nvgraph/cpp/include/semiring.hxx b/cpp/nvgraph/cpp/include/semiring.hxx new file mode 100644 index 00000000000..7ecc366fc38 --- /dev/null +++ b/cpp/nvgraph/cpp/include/semiring.hxx @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include "atomics.hxx" +#include "nvgraph_error.hxx" + +namespace nvgraph{ +//define nvgraph min and max oprators +template +__host__ __device__ __forceinline__ T min(const T&a, const T &b) +{ + return (a < b) ? a : b; +} + +template +__host__ __device__ __forceinline__ T max(const T&a, const T &b) +{ + return (a > b) ? a : b; +} + +//have routines to return these operators +template //ValueType_ is Value_type of the graph +struct PlusTimesSemiring +{ + typedef ValueType_ SR_type; + SR_type plus_ident, times_ident, times_null; + PlusTimesSemiring() + { + if (typeid(ValueType_) != typeid(float) && typeid(ValueType_) != typeid(double)) + FatalError("Graph value type is not supported by this semiring.", NVGRAPH_ERR_BAD_PARAMETERS); + + //for semiring need multiplicative and additive identity + plus_ident = SR_type(0); + times_ident = SR_type(1); + //also need multiplicative null + times_null = SR_type(0); + } + __host__ __device__ __forceinline__ void setPlus_ident(SR_type &val) + { + val = SR_type(0); + } + + __host__ __device__ __forceinline__ SR_type plus(const SR_type &arg0, const SR_type &arg1) + { + return arg0 + arg1; + } + __host__ __device__ __forceinline__ SR_type times(const SR_type &arg0, const SR_type &arg1) + { + return arg0 * arg1; + } + //potential private member to be used in reduction by key so only need atomic for plus operator + __device__ __forceinline__ void atomicPlus(SR_type *addr, SR_type val) + { + atomicFPAdd(addr, val); + } + __device__ __forceinline__ SR_type shflPlus(SR_type input, int firstLane, int offset) + { + return shflFPAdd(input, firstLane, offset); + } +}; + +template +struct MinPlusSemiring +{ + typedef ValueType_ SR_type; //possibly change for integers to cast to floats + SR_type plus_ident, times_ident, times_null; + MinPlusSemiring() + { + if (typeid(ValueType_) != typeid(float) && typeid(ValueType_) != typeid(double)) + FatalError("Graph value type is not supported by this semiring.", NVGRAPH_ERR_BAD_PARAMETERS); + + //for semiring need multiplicative and additive identity//put in constructor + SR_type inf = (typeid(ValueType_) == typeid(float)) ? 
FLT_MAX : DBL_MAX; //check for cuda add type identifiers + plus_ident = SR_type(inf); + times_ident = SR_type(0); + //also need multiplicative null + times_null = SR_type(inf); + } + __host__ __device__ __forceinline__ void setPlus_ident(float &val) + { + val = FLT_MAX; + } + + __host__ __device__ __forceinline__ void setPlus_ident(double &val) + { + val = DBL_MAX; + } + + __host__ __device__ __forceinline__ SR_type plus(const SR_type &arg0, const SR_type &arg1) + { + return min(arg0, arg1); //check and change!-using min in csrmv.cu + } + __host__ __device__ __forceinline__ SR_type times(const SR_type &arg0, const SR_type &arg1) + { + return arg0 + arg1; + } + //potential private member to be used in reduction by key so only need atomic for plus operator + __device__ __forceinline__ void atomicPlus(SR_type *addr, SR_type val) + { + atomicFPMin(addr, val); + } + __device__ __forceinline__ SR_type shflPlus(SR_type input, int firstLane, int offset) + { + return shflFPMin(input, firstLane, offset); + } +}; + +template +struct MaxMinSemiring //bottleneck semiring +{ + typedef ValueType_ SR_type;//could be integers template and check that type makes sense + SR_type plus_ident, times_ident, times_null; + MaxMinSemiring() + { + if (typeid(ValueType_) != typeid(float) && typeid(ValueType_) != typeid(double)) + FatalError("Graph value type is not supported by this semiring.", NVGRAPH_ERR_BAD_PARAMETERS); + + //for semiring need multiplicative and additive identity + SR_type inf = (typeid(ValueType_) == typeid(float)) ? FLT_MAX : DBL_MAX; + plus_ident = SR_type(-inf); + times_ident = SR_type(inf); + //also need multiplicative null + times_null = SR_type(-inf); + } + __host__ __device__ __forceinline__ void setPlus_ident(float &val) + { + val = -FLT_MAX; + } + + __host__ __device__ __forceinline__ void setPlus_ident(double &val) + { + val = -DBL_MAX; + } + + __host__ __device__ __forceinline__ SR_type plus(const SR_type &arg0, const SR_type &arg1) + { + return max(arg0, arg1); //check and change!-using min in csrmv.cu can use thrust + } + __host__ __device__ __forceinline__ SR_type times(const SR_type &arg0, const SR_type &arg1) + { + return min(arg0,arg1); + } + //potential private member to be used in reduction by key so only need atomic for plus operator + __device__ __forceinline__ void atomicPlus(SR_type *addr, SR_type val) + { + atomicFPMax(addr, val); + } + __device__ __forceinline__ SR_type shflPlus(SR_type input, int firstLane, int offset) + { + return shflFPMax(input, firstLane, offset); + } +}; + +template +struct OrAndBoolSemiring //bottleneck semiring +{ + typedef ValueType_ SR_type;//could be integers + SR_type plus_ident, times_ident, times_null; + OrAndBoolSemiring() + { + //embed the bools in the reals just use 0 and 1 in floats + if (typeid(ValueType_) != typeid(float) && typeid(ValueType_) != typeid(double)) + FatalError("Graph value type is not supported by this semiring.", NVGRAPH_ERR_BAD_PARAMETERS); + + //for semiring need multiplicative and additive identity + plus_ident = SR_type(0); + times_ident = SR_type(1); + //also need multiplicative null + times_null = SR_type(0); + } + __host__ __device__ __forceinline__ void setPlus_ident(SR_type &val) + { + val = SR_type(0); + } + + __host__ __device__ __forceinline__ SR_type plus(const SR_type &arg0, const SR_type &arg1) + { + return (bool) arg0 | (bool) arg1; //check and change!-using min in csrmv.cu can use thrust + } + __host__ __device__ __forceinline__ SR_type times(const SR_type &arg0, const SR_type &arg1) + { + return (bool) 
arg0 & (bool) arg1; + } + //potential private member to be used in reduction by key so only need atomic for plus operator + //need to check this atomic since it takes integer parameters instead of boolean + __device__ __forceinline__ void atomicPlus(SR_type *addr, SR_type val) + { + atomicFPOr(addr, val); + } + //DOESN"T work returns exclusive or + __device__ __forceinline__ SR_type shflPlus(SR_type input, int firstLane, int offset) + { + return shflFPOr(input, firstLane, offset); + } +}; +//This Semiring does not work. WIll not be supported in first version +template +struct LogPlusSemiring //bottleneck semiring +{ + typedef ValueType_ SR_type;//could be integers + SR_type plus_ident, times_ident, times_null; + LogPlusSemiring() + { + //for semiring need multiplicative and additive identity + if (typeid(ValueType_) != typeid(float) && typeid(ValueType_) != typeid(double)) + FatalError("Graph value type is not supported by this semiring.", NVGRAPH_ERR_BAD_PARAMETERS); + + SR_type inf = (typeid(ValueType_) == typeid(float)) ? FLT_MAX : DBL_MAX; + plus_ident = SR_type(inf); + times_ident = SR_type(0); + //also need multiplicative null + times_null = SR_type(inf); + } + + __host__ __device__ __forceinline__ void setPlus_ident(float &val) + { + val = FLT_MAX; + } + + __host__ __device__ __forceinline__ void setPlus_ident(double &val) + { + val = DBL_MAX; + } + + __host__ __device__ __forceinline__ SR_type plus(const SR_type &arg0, const SR_type &arg1) + { + return -log(exp(-arg0) + exp(-arg1)); //check calling cuda log and arg0 ok for float not double? + } + __host__ __device__ __forceinline__ SR_type times(const SR_type &arg0, const SR_type &arg1) + { + return arg0 + arg1; + } + //this will not work! + __device__ __forceinline__ void atomicPlus(SR_type *addr, SR_type val) + { + atomicFPLog(addr, val); + } + //this DOES NOT work! Need customized shfl isntructions for logPlus + __device__ __forceinline__ SR_type shflPlus(SR_type input, int firstLane, int offset) + { + return shflFPAdd(input, firstLane, offset); + } +}; + +}// end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/shfl.hxx b/cpp/nvgraph/cpp/include/shfl.hxx new file mode 100644 index 00000000000..0341606b3ba --- /dev/null +++ b/cpp/nvgraph/cpp/include/shfl.hxx @@ -0,0 +1,450 @@ + /* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "sm_utils.h" + +namespace nvgraph{ + + __device__ __forceinline__ float shflFPAdd( + float input, //Calling thread's input item. 
+ int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + float output; + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input), "r"(mask)); + +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input)); +#endif + + return output; + + } + + //incorporate into cusparse and try to remove + // Inclusive prefix scan step speciliazed for summation of doubles + __device__ __forceinline__ double shflFPAdd( + double input, //Calling thread's input item. + int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + double output; + + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " }" + " @p add.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " }" + " @p add.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input)); +#endif + + return output; + } + + __device__ __forceinline__ float shflFPMin( + float input, //Calling thread's input item. + int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + float output; + //if (threadIdx.x + blockDim.x*blockIdx.x < 4)device_printf("Thread = %d %f\n", threadIdx.x + blockDim.x*blockIdx.x, input); + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p min.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p min.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input)); +#endif + return output; + } + + //incorporate into cusparse and try to remove + // Inclusive prefix scan step speciliazed for summation of doubles + __device__ __forceinline__ double shflFPMin( + double input, //Calling thread's input item. 
+ int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + double output; + + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " }" + " @p min.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " }" + " @p min.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input)); +#endif + + return output; + } + + __device__ __forceinline__ float shflFPMax( + float input, //Calling thread's input item. + int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + float output; + //if (threadIdx.x + blockDim.x*blockIdx.x < 4)device_printf("Thread = %d %f\n", threadIdx.x + blockDim.x*blockIdx.x, input); + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p max.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p max.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input)); +#endif + return output; + + //return output; + } + + //incorporate into cusparse and try to remove + // Inclusive prefix scan step speciliazed for summation of doubles + __device__ __forceinline__ double shflFPMax( + double input, //Calling thread's input item. + int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + double output; + + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " }" + " @p max.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " }" + " @p max.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input)); +#endif + + return output; + } + + __device__ __forceinline__ float shflFPOr( + float input, //Calling thread's input item. 
+ int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + float output; + //if (threadIdx.x + blockDim.x*blockIdx.x < 4)device_printf("Thread = %d %f\n", threadIdx.x + blockDim.x*blockIdx.x, input); + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p or.b32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p or.b32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(firstLane), "f"(input)); +#endif + + return output; + } + + __device__ __forceinline__ double shflFPOr( + double input, //Calling thread's input item. + int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + double output; + + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " }" + " @p or.b64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " }" + " @p or.b64 r0, r0, %4;" + " mov.f64 %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(firstLane), "d"(input)); +#endif + + return output; + } +//Need to write correct instructions in asm for the operation -log(exp(-x) + exp(-y)) + __device__ __forceinline__ float shflFPLog( + float input, //Calling thread's input item. + int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + float output; + float expinput = expf(-input); //this must be shuffled and adding + float baseChange = log2(expf(1.0)); //for change of base formaula + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + " @p lg2.approx.f32 %0, r0;" //convert to natural logarithm!! + //add another variable for e in change of base compute log_e(x) = log_2(x) / log_2(e) + " @p neg.f32 %0, r0;" + "}" + : "=f"(output) : "f"(expinput), "r"(offset), "r"(firstLane), "f"(expinput), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + " @p lg2.approx.f32 %0, r0;" //convert to natural logarithm!! + //add another variable for e in change of base compute log_e(x) = log_2(x) / log_2(e) + " @p neg.f32 %0, r0;" + "}" + : "=f"(output) : "f"(expinput), "r"(offset), "r"(firstLane), "f"(expinput)); +#endif + return output; + } +//check this!! + __device__ __forceinline__ double shflFPLog( + double input, //Calling thread's input item. 
+ int firstLane, //Index of first lane in segment + int offset, //Upstream offset to pull from + int mask = DEFAULT_MASK) // lane mask for operation + { + double output; + double expinput = exp(-input); + double baseChange = log2(exp(1.0));//divide byt his + + // Use predicate set from SHFL to guard against invalid peers +#if USE_CG + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " }" + " @p add.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + // " @p lg2.approx.f32 %0, r0;" //f64 not supported!! + " @p neg.f64 %0, r0;" + "}" + : "=d"(output) : "d"(expinput), "r"(offset), "r"(firstLane), "d"(expinput), "r"(mask)); +#else + asm volatile( + "{" + " .reg .f64 r0;" + " .reg .pred p;" + " {" + " .reg .u32 lo;" + " .reg .u32 hi;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " }" + " @p add.f64 r0, r0, %4;" + " mov.f64 %0, r0;" + // " @p lg2.approx.f32 %0, r0;" //f64 not supported!! + " @p neg.f64 %0, r0;" + "}" + : "=d"(output) : "d"(expinput), "r"(offset), "r"(firstLane), "d"(expinput)); +#endif + + return output; + } + +} //end namespace + diff --git a/cpp/nvgraph/cpp/include/size2_selector.cuh b/cpp/nvgraph/cpp/include/size2_selector.cuh new file mode 100644 index 00000000000..c8d5b4bcd64 --- /dev/null +++ b/cpp/nvgraph/cpp/include/size2_selector.cuh @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include //count +#include //sort +#include //lower_bound +#include //unique +#include +#include "async_event.cuh" +#include "graph_utils.cuh" +#include "common_selector.cuh" +#include "valued_csr_graph.cuh" + + +// This should be enabled +#define EXPERIMENTAL_ITERATIVE_MATCHING + +using namespace nvlouvain; + +namespace nvlouvain{ + +typedef enum +{ + USER_PROVIDED = 0, // using edge values as is + SCALED_BY_ROW_SUM = 1, // 0.5*(A_ij+A_ji)/max(d(i),d (j)), where d(i) is the sum of the row i + SCALED_BY_DIAGONAL = 2, // 0.5*(A_ij+A_ji)/max(diag(i),diag(j)) +}Matching_t; + +typedef enum{ + NVGRAPH_OK = 0, + NVGRAPH_ERR_BAD_PARAMETERS = 1, +}NVGRAPH_ERROR; + + + +template +class Size2Selector +{ + + public: + + Size2Selector(); + + Size2Selector(Matching_t similarity_metric, int deterministic = 1, int max_iterations = 15 , ValueType numUnassigned_tol = 0.05 ,bool two_phase = false, bool merge_singletons = true, cudaStream_t stream = 0) + :m_similarity_metric(similarity_metric), m_deterministic(deterministic), m_max_iterations(max_iterations), m_numUnassigned_tol(numUnassigned_tol), m_two_phase(two_phase), m_merge_singletons(merge_singletons), m_stream(stream) + { + m_aggregation_edge_weight_component = 0; + m_weight_formula = 0; + } + +// NVGRAPH_ERROR setAggregates(const CsrGraph &A, Vector &aggregates, int &num_aggregates); + NVGRAPH_ERROR setAggregates(cusparseHandle_t, const IndexType n_vertex, const IndexType n_edges, IndexType* csr_ptr, IndexType* csr_ind, ValueType* csr_val, Vector &aggregates, int &num_aggregates); + + + protected: +// NVGRAPH_ERROR setAggregates_common_sqblocks(const CsrGraph &A, Vector &aggregates, int &num_aggregates); + NVGRAPH_ERROR setAggregates_common_sqblocks(cusparseHandle_t, const IndexType n_vertex, const IndexType n_edges, IndexType* csr_ptr, IndexType* csr_ind, ValueType* csr_val, Vector &aggregates, int &num_aggregates); + + Matching_t m_similarity_metric; + int m_deterministic; + int m_max_iterations; + ValueType m_numUnassigned_tol; + bool m_two_phase; + bool m_merge_singletons; + cudaStream_t m_stream; + int m_aggregation_edge_weight_component; + int m_weight_formula; +}; + +} + + +template +void renumberAndCountAggregates(Vector &aggregates, const IndexType n, IndexType& num_aggregates) +{ + // renumber aggregates + Vector scratch(n+1); + scratch.fill(0); + thrust::device_ptr aggregates_thrust_dev_ptr(aggregates.raw()); + thrust::device_ptr scratch_thrust_dev_ptr(scratch.raw()); + + // set scratch[aggregates[i]] = 1 + thrust::fill(thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr), + thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr + n), 1); + //scratch.dump(0,scratch.get_size()); + + // do prefix sum on scratch + thrust::exclusive_scan(scratch_thrust_dev_ptr, scratch_thrust_dev_ptr + n + 1, scratch_thrust_dev_ptr); + // scratch.dump(0,scratch.get_size()); + + // aggregates[i] = scratch[aggregates[i]] + thrust::copy(thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr), + thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr + n), + aggregates_thrust_dev_ptr); + cudaCheckError(); + cudaMemcpy(&num_aggregates, &scratch.raw()[scratch.get_size()-1], sizeof(int), cudaMemcpyDefault); //num_aggregates = scratch.raw()[scratch.get_size()-1]; + cudaCheckError(); + +} + +// ------------------ +// Constructors +// ------------------ + +template +Size2Selector::Size2Selector() +{ + //Using default 
vaues from AmgX + m_deterministic = 1; + m_stream=0; + m_max_iterations = 15; + m_numUnassigned_tol = 0.05; + m_two_phase = 0; + m_aggregation_edge_weight_component= 0; + m_merge_singletons = 1; + m_weight_formula = 0; + m_similarity_metric = SCALED_BY_ROW_SUM; +} + +// ------------------ +// Methods +// ------------------ + +// setAggregates for block_dia_csr_matrix_d format +template +NVGRAPH_ERROR Size2Selector::setAggregates_common_sqblocks( +cusparseHandle_t cusp_handle, +const IndexType n_vertex, +const IndexType n_edges, +IndexType *csr_ptr, +IndexType *csr_ind, +ValueType *csr_val, +Vector &aggregates, int &num_aggregates) +{ + const IndexType n = n_vertex; + const IndexType nnz = n_edges; + const IndexType *A_row_offsets_ptr = csr_ptr; + const IndexType *A_column_indices_ptr = csr_ind; + const ValueType *A_nonzero_values_ptr = csr_val; + + // compute row indices + Vector row_indices(nnz); + IndexType* row_indices_raw_ptr = row_indices.raw(); +// Cusparse::csr2coo( n, nnz, A_row_offsets_ptr, row_indices.raw()); // note : amgx uses cusp for that + //cusparseHandle_t cusp_handle; + //cusparseCreate(&cusp_handle); + + cusparseXcsr2coo(cusp_handle, A_row_offsets_ptr, + nnz, n, row_indices_raw_ptr, + CUSPARSE_INDEX_BASE_ZERO); + + const IndexType *A_row_indices_ptr = row_indices.raw(); + + //All vectors should be initialized to -1. + aggregates.fill(-1); + Vector strongest_neighbour(n); + strongest_neighbour.fill(-1); + Vector strongest_neighbour_1phase(n); + strongest_neighbour_1phase.fill(-1); + Vector edge_weights(nnz); + edge_weights.fill(-1); + float *edge_weights_ptr = edge_weights.raw(); + float *rand_edge_weights_ptr = NULL; + cudaCheckError(); + + IndexType *strongest_neighbour_ptr = strongest_neighbour.raw(); + IndexType *strongest_neighbour_1phase_ptr = strongest_neighbour_1phase.raw(); + IndexType *aggregates_ptr = aggregates.raw(); + + const int threads_per_block = 256; + const int max_grid_size = 256; + const int num_blocks = min( max_grid_size, (n-1)/threads_per_block+ 1 ); + const int num_blocks_V2 = min( max_grid_size, (nnz-1)/threads_per_block + 1); + int bsize = 1; // AmgX legacy: we don't use block CSR matrices, this is just to specify that we run on regular matrices + + int numUnassigned = n; + int numUnassigned_previous = numUnassigned; + thrust::device_ptr aggregates_thrust_dev_ptr(aggregates_ptr); + switch(m_similarity_metric) + { + case USER_PROVIDED : + { + //printf("user provided !!!!!!!!!!!!!!!! 
\n"); + //copy non wero values of A in edge_weights (float) + convert_type<<m_stream>>>(nnz, A_nonzero_values_ptr, edge_weights_ptr); + cudaCheckError(); + //edge_weights.dump(0,nnz); + break; + } + case SCALED_BY_ROW_SUM : + { /* comment out by Tin-Yin + // Compute the edge weights using .5*(A_ij+A_ji)/max(d(i),d(j)) where d(i) is the sum of outgoing edges of i + + Vector row_sum(n); + const ValueType *A_row_sum_ptr = row_sum.raw(); + Vector ones(n); + ones.fill(1.0); + ValueType alpha = 1.0, beta =0.0; + Cusparse::csrmv(false, false, n, n, nnz,&alpha,A_nonzero_values_ptr, A_row_offsets_ptr, A_column_indices_ptr, ones.raw(),&beta, row_sum.raw()); + cudaFuncSetCacheConfig(computeEdgeWeightsBlockDiaCsr_V2,cudaFuncCachePreferL1); + computeEdgeWeights_simple<<m_stream>>>(A_row_offsets_ptr, A_row_indices_ptr, A_column_indices_ptr, A_row_sum_ptr, A_nonzero_values_ptr, nnz, edge_weights_ptr, rand_edge_weights_ptr, n, this->m_weight_formula); + cudaCheckError(); + break; +*/ + + } + case SCALED_BY_DIAGONAL : + { + // Compute the edge weights using AmgX formula (works only if there is a diagonal entry for each row) + Vector diag_idx(n); + const IndexType *A_dia_idx_ptr = diag_idx.raw(); + + computeDiagonalKernelCSR<<m_stream>>>(n, csr_ptr, csr_ind, diag_idx.raw()); + cudaCheckError(); + + cudaFuncSetCacheConfig(computeEdgeWeightsBlockDiaCsr_V2,cudaFuncCachePreferL1); + computeEdgeWeightsBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_row_indices_ptr, A_column_indices_ptr, A_dia_idx_ptr, A_nonzero_values_ptr, nnz, edge_weights_ptr, rand_edge_weights_ptr, n, bsize,this->m_aggregation_edge_weight_component, this->m_weight_formula); + cudaCheckError(); + break; + } + default: return NVGRAPH_ERR_BAD_PARAMETERS; + } + +#ifdef EXPERIMENTAL_ITERATIVE_MATCHING + // TODO (from amgx): allocate host pinned memory + AsyncEvent *throttle_event = new AsyncEvent; + throttle_event->create(); + std::vector h_unagg_vec(1); + Vector d_unagg_vec(1); + + int *unaggregated = &h_unagg_vec[0]; + int *d_unaggregated = d_unagg_vec.raw(); + +#endif + + int icount, s = 1; + { + icount = 0; + float *weights_ptr = edge_weights_ptr; + + do + { + if( !this->m_two_phase ) { + // 1-phase handshaking + findStrongestNeighbourBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, weights_ptr, n, aggregates_ptr, strongest_neighbour_ptr, strongest_neighbour_ptr, bsize, 1, this->m_merge_singletons); + cudaCheckError(); + + } + else { + // 2-phase handshaking + findStrongestNeighbourBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, weights_ptr, n, aggregates_ptr, strongest_neighbour_1phase_ptr, strongest_neighbour_ptr, bsize, 1, this->m_merge_singletons); + cudaCheckError(); + + + // 2nd phase: for each block_row, find the strongest neighbour among those who gave hand on 1st phase + findStrongestNeighbourBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, weights_ptr, n, aggregates_ptr, strongest_neighbour_1phase_ptr, strongest_neighbour_ptr, bsize, 2, this->m_merge_singletons); + cudaCheckError(); + } + + // Look for perfect matches. 
Also, for nodes without unaggregated neighbours, merge with aggregate containing strongest neighbour + matchEdges<<m_stream>>>(n, aggregates_ptr, strongest_neighbour_ptr); + cudaCheckError(); + +#ifdef EXPERIMENTAL_ITERATIVE_MATCHING + s = (icount & 1); + if( s == 0 ) + { + // count unaggregated vertices + cudaMemsetAsync(d_unaggregated, 0, sizeof(int), this->m_stream); + countAggregates<<m_stream>>>(n, aggregates_ptr, d_unaggregated); + cudaCheckError(); + + cudaMemcpyAsync(unaggregated, d_unaggregated, sizeof(int), cudaMemcpyDeviceToHost, this->m_stream); + throttle_event->record(this->m_stream); + cudaCheckError(); + } + else + { + throttle_event->sync(); + + numUnassigned_previous = numUnassigned; + numUnassigned = *unaggregated; + } +#else + cudaStreamSynchronize(this->m_stream); + numUnassigned_previous = numUnassigned; + numUnassigned = (int)thrust::count(aggregates_thrust_dev_ptr, aggregates_thrust_dev_ptr+n,-1); + cudaCheckError(); +#endif + + icount++; + } while ( (s == 0) || !(numUnassigned==0 || icount > this->m_max_iterations || 1.0*numUnassigned/n < this->m_numUnassigned_tol || numUnassigned == numUnassigned_previous)); + } + + //print + //printf("icount=%i, numUnassiged=%d, numUnassigned_tol=%f\n", icount, numUnassigned, this->m_numUnassigned_tol); + +#ifdef EXPERIMENTAL_ITERATIVE_MATCHING + delete throttle_event; +#endif + + if( this->m_merge_singletons ) + { + // Merge remaining vertices with current aggregates + if (!this->m_deterministic) + { + while (numUnassigned != 0) + { + mergeWithExistingAggregatesBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, edge_weights_ptr, n, aggregates_ptr, bsize,this->m_deterministic,(IndexType*) NULL); + cudaCheckError(); + + numUnassigned = (int)thrust::count(aggregates_thrust_dev_ptr, aggregates_thrust_dev_ptr+n,-1); + cudaCheckError(); + } + + } + else + { + Vector aggregates_candidate(n); + aggregates_candidate.fill(-1); + + while (numUnassigned != 0) + { + mergeWithExistingAggregatesBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, edge_weights_ptr, n, aggregates_ptr, bsize,this->m_deterministic,aggregates_candidate.raw()); + cudaCheckError(); + + joinExistingAggregates<<m_stream>>>(n, aggregates_ptr, aggregates_candidate.raw()); + cudaCheckError(); + + numUnassigned = (int)thrust::count(aggregates_thrust_dev_ptr, aggregates_thrust_dev_ptr+n,-1); + cudaCheckError(); + } + } + } + else + { + //make singletons + aggregateSingletons<<m_stream>>>( aggregates_ptr, n ); + cudaCheckError(); + } + + renumberAndCountAggregates(aggregates, n, num_aggregates); + + return NVGRAPH_OK; +} +/* +template +NVGRAPH_ERROR Size2Selector::setAggregates(const CsrGraph &A, Vector &aggregates, int &num_aggregates) +{ + return setAggregates_common_sqblocks( A, aggregates, num_aggregates); +} +*/ + +template +NVGRAPH_ERROR Size2Selector::setAggregates( +cusparseHandle_t cusp_handle, +const IndexType n_vertex, +const IndexType n_edges, +IndexType *csr_ptr, +IndexType *csr_ind, +ValueType *csr_val, +Vector &aggregates, int &num_aggregates) +{ + return setAggregates_common_sqblocks(cusp_handle, n_vertex, n_edges, csr_ptr, csr_ind, csr_val, aggregates, num_aggregates); +} + +//template class Size2Selector; +//template class Size2Selector; +//template void renumberAndCountAggregates (Vector &aggregates, const int n, int& num_aggregates); + diff --git a/cpp/nvgraph/cpp/include/size2_selector.hxx b/cpp/nvgraph/cpp/include/size2_selector.hxx new file mode 100644 index 00000000000..3e91f0761b7 --- /dev/null +++ 
b/cpp/nvgraph/cpp/include/size2_selector.hxx @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +//#include +#include +#include + +namespace nvgraph { + +typedef enum +{ + USER_PROVIDED = 0, // using edge values as is + SCALED_BY_ROW_SUM = 1, // 0.5*(A_ij+A_ji)/max(d(i),d (j)), where d(i) is the sum of the row i + SCALED_BY_DIAGONAL = 2, // 0.5*(A_ij+A_ji)/max(diag(i),diag(j)) +}Matching_t; + +template +class Size2Selector +{ + + public: + typedef IndexType_ IndexType; + typedef ValueType_ ValueType; + + Size2Selector(); + + Size2Selector(Matching_t similarity_metric, int deterministic = 1, int max_iterations = 15 , ValueType numUnassigned_tol = 0.05 ,bool two_phase = false, bool merge_singletons = true, cudaStream_t stream = 0) + :m_similarity_metric(similarity_metric), m_deterministic(deterministic), m_max_iterations(max_iterations), m_numUnassigned_tol(numUnassigned_tol), m_two_phase(two_phase), m_merge_singletons(merge_singletons), m_stream(stream) + { + m_aggregation_edge_weight_component = 0; + m_weight_formula = 0; + } + + NVGRAPH_ERROR setAggregates(const ValuedCsrGraph &A, Vector &aggregates, int &num_aggregates); + + protected: + NVGRAPH_ERROR setAggregates_common_sqblocks(const ValuedCsrGraph &A, Vector &aggregates, int &num_aggregates); + Matching_t m_similarity_metric; + int m_deterministic; + int m_max_iterations; + ValueType m_numUnassigned_tol; + bool m_two_phase; + bool m_merge_singletons; + cudaStream_t m_stream; + int m_aggregation_edge_weight_component; + int m_weight_formula; +}; + +}//nvgraph diff --git a/cpp/nvgraph/cpp/include/sm_utils.h b/cpp/nvgraph/cpp/include/sm_utils.h new file mode 100644 index 00000000000..59ad4c9258e --- /dev/null +++ b/cpp/nvgraph/cpp/include/sm_utils.h @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#define DEFAULT_MASK 0xffffffff + +#define USE_CG 1 +//(__CUDACC_VER__ >= 80500) + + +namespace nvgraph +{ +namespace utils +{ + static __device__ __forceinline__ int lane_id() + { + int id; + asm ( "mov.u32 %0, %%laneid;" : "=r"(id) ); + return id; + } + + static __device__ __forceinline__ int lane_mask_lt() + { + int mask; + asm ( "mov.u32 %0, %%lanemask_lt;" : "=r"(mask) ); + return mask; + } + + static __device__ __forceinline__ int lane_mask_le() + { + int mask; + asm ( "mov.u32 %0, %%lanemask_le;" : "=r"(mask) ); + return mask; + } + + static __device__ __forceinline__ int warp_id() + { + return threadIdx.x >> 5; + } + + static __device__ __forceinline__ unsigned int ballot(int p, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#if USE_CG + return __ballot_sync(mask, p); +#else + return __ballot(p); +#endif + #else + return 0; + #endif + } + + static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound ); +#else + return __shfl(r, lane, bound ); +#endif + #else + return 0; + #endif + } + + static __device__ __forceinline__ float shfl(float r, int lane, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound ); +#else + return __shfl(r, lane, bound ); +#endif + #else + return 0.0f; + #endif + } + + /// Warp shuffle down function + /** Warp shuffle functions on 64-bit floating point values are not + * natively implemented as of Compute Capability 5.0. This + * implementation has been copied from + * (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler). + * Once this is natively implemented, this function can be replaced + * by __shfl_down. 
+ * + */ + static __device__ __forceinline__ double shfl(double r, int lane, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif + #else + return 0.0; + #endif + } + + static __device__ __forceinline__ long long shfl(long long r, int lane, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif + #else + return 0.0; + #endif + } + + static __device__ __forceinline__ int shfl_down(int r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync( mask, r, offset, bound ); +#else + return __shfl_down( r, offset, bound ); +#endif + #else + return 0.0f; + #endif + } + + static __device__ __forceinline__ float shfl_down(float r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync( mask, r, offset, bound ); +#else + return __shfl_down( r, offset, bound ); +#endif + #else + return 0.0f; + #endif + } + + static __device__ __forceinline__ double shfl_down(double r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif + #else + return 0.0; + #endif + } + + static __device__ __forceinline__ long long shfl_down(long long r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif + #else + return 0.0; + #endif + } + + // specifically for triangles counting + static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(mask, a.x, offset, bound); + a.y = __shfl_down(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#endif + #else + return 0.0; + #endif + } + + static __device__ __forceinline__ int shfl_up(int r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync( mask, r, offset, bound ); +#else + return __shfl_up( r, offset, bound ); +#endif + #else + 
return 0.0f; + #endif + } + + static __device__ __forceinline__ float shfl_up(float r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync( mask, r, offset, bound ); +#else + return __shfl_up( r, offset, bound ); +#endif + #else + return 0.0f; + #endif + } + + static __device__ __forceinline__ double shfl_up(double r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif + #else + return 0.0; + #endif + } + + static __device__ __forceinline__ long long shfl_up(long long r, int offset, int bound = 32, int mask = DEFAULT_MASK) + { + #if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif + #else + return 0.0; + #endif + } +} + +} diff --git a/cpp/nvgraph/cpp/include/sssp.hxx b/cpp/nvgraph/cpp/include/sssp.hxx new file mode 100644 index 00000000000..fe8fda4606b --- /dev/null +++ b/cpp/nvgraph/cpp/include/sssp.hxx @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace nvgraph +{ +template +class Sssp +{ +public: + typedef IndexType_ IndexType; + typedef ValueType_ ValueType; + +private: + ValuedCsrGraph m_network ; + Vector m_sssp; + Vector m_tmp; + Vector m_mask; // mask[i] = 0 if we can ignore the i th column in the csrmv + + IndexType m_source; + ValueType m_residual; + int m_iterations; + bool m_is_setup; + + cudaStream_t m_stream; + + bool solve_it(); + void setup(IndexType source_index, Vector& source_connection, Vector& sssp_result); + +public: + // Simple constructor + Sssp(void) {}; + // Simple destructor + ~Sssp(void) {}; + + // Create a Sssp solver attached to a the transposed of a weighted network + // *** network is the transposed/CSC*** + Sssp(const ValuedCsrGraph & network, cudaStream_t stream = 0):m_network(network),m_is_setup(false), m_stream(stream) {}; + + /*! Find the sortest path from the vertex source_index to every other vertices. + * + * \param source_index The source. + * \param source_connection The connectivity of the source + * if there is a link from source_index to i, source_connection[i] = E(source_index, i) + * otherwise source_connection[i] = inifinity + * source_connection[source_index] = 0 + The source_connection is computed somewhere else. 
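+ *
+ * A minimal calling sketch (illustrative only; csc_network, stream, source,
+ * source_connection and sssp_result are assumed to be set up by the caller,
+ * with csc_network holding the transposed/CSC weighted graph):
+ *
+ *   Sssp<int, float> solver(csc_network, stream);
+ *   NVGRAPH_ERROR rc = solver.solve(source, source_connection, sssp_result);
+ *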
+ * \param (output) m_sssp m_sssp[i] contains the sortest path from the source to the vertex i. + */ + + NVGRAPH_ERROR solve(IndexType source_index, Vector& source_connection, Vector& sssp_result); + inline int get_iterations() const {return m_iterations;} +}; + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/stacktrace.h b/cpp/nvgraph/cpp/include/stacktrace.h new file mode 100644 index 00000000000..1f3b6f2b83b --- /dev/null +++ b/cpp/nvgraph/cpp/include/stacktrace.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//adapted from https://idlebox.net/2008/0901-stacktrace-demangled/ and licensed under WTFPL v2.0 +#pragma once + +#if defined(_WIN32) || defined (__ANDROID__) || defined(ANDROID) || defined (__QNX__) || defined (__QNXNTO__) +#else + #include + #include + #include + #include + #include +#endif + +#include +#include +#include +#include +namespace nvgraph { + +/** Print a demangled stack backtrace of the caller function to FILE* out. */ +static inline void printStackTrace(std::ostream &eout = std::cerr, unsigned int max_frames = 63) +{ +#if defined(_WIN32) || defined (__ANDROID__) || defined(ANDROID) || defined (__QNX__) || defined (__QNXNTO__) + //TODO add code for windows stack trace and android stack trace +#else + std::stringstream out; + + // storage array for stack trace address data + void* addrlist[max_frames+1]; + + // retrieve current stack addresses + int addrlen = backtrace(addrlist, sizeof(addrlist) / sizeof(void*)); + + if (addrlen == 0) { + out << " \n"; + return; + } + + // resolve addresses into strings containing "filename(function+address)", + // this array must be free()-ed + char** symbollist = backtrace_symbols(addrlist, addrlen); + + // allocate string which will be filled with the demangled function name + size_t funcnamesize = 256; + char* funcname = (char*)malloc(funcnamesize); + + // iterate over the returned symbol lines. skip the first, it is the + // address of this function. + for (int i = 1; i < addrlen; i++) + { + char *begin_name = 0, *begin_offset = 0, *end_offset = 0; + + // find parentheses and +address offset surrounding the mangled name: + // ./module(function+0x15c) [0x8048a6d] + for (char *p = symbollist[i]; *p; ++p) + { + if (*p == '(') + begin_name = p; + else if (*p == '+') + begin_offset = p; + else if (*p == ')' && begin_offset) { + end_offset = p; + break; + } + } + + if (begin_name && begin_offset && end_offset + && begin_name < begin_offset) + { + *begin_name++ = '\0'; + *begin_offset++ = '\0'; + *end_offset = '\0'; + + // mangled name is now in [begin_name, begin_offset) and caller + // offset in [begin_offset, end_offset). 
now apply + // __cxa_demangle(): + + int status; + char* ret = abi::__cxa_demangle(begin_name, + funcname, &funcnamesize, &status); + if (status == 0) { + funcname = ret; // use possibly realloc()-ed string + out << " " << symbollist[i] << " : " << funcname << "+" << begin_offset << "\n"; + } + else { + // demangling failed. Output function name as a C function with + // no arguments. + out << " " << symbollist[i] << " : " << begin_name << "()+" << begin_offset << "\n"; + } + } + else + { + // couldn't parse the line? print the whole line. + out << " " << symbollist[i] << "\n"; + } + } + eout << out.str(); + //error_output(out.str().c_str(),out.str().size()); + free(funcname); + free(symbollist); + //printf("PID of failing process: %d\n",getpid()); + //while(1); +#endif +} + +} //end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/subg_extrctrs.hxx b/cpp/nvgraph/cpp/include/subg_extrctrs.hxx new file mode 100644 index 00000000000..60bff6417bd --- /dev/null +++ b/cpp/nvgraph/cpp/include/subg_extrctrs.hxx @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace nvgraph{ +namespace debug{ + +//Sequential CSR graph extractor +//for DEBUGGING purposes, only +// +template +struct SeqSubGraphExtractorFunctor +{ + typedef typename VectorI::value_type IndexT; + typedef typename VectorV::value_type ValueT; + typedef typename VectorB::value_type ValueB; + + explicit SeqSubGraphExtractorFunctor(const VectorI& vSubset): + vertexSubset(vSubset) + { + //make sure vertexSubset_ is sorted increasingly: + ///sort_ifnot(vertexSubset); + } + + virtual ~SeqSubGraphExtractorFunctor(void) + { + } + + const VectorV& get_vals(void) const + { + return vals_subg; + } + + VectorV& get_vals(void) + { + return vals_subg; + } + + const VectorI& get_row_ptr(void) const + { + return row_ptr_subg; + } + + const VectorI& get_col_ind(void) const + { + return col_ind_subg; + } + + struct ValueUpdater + { + ValueUpdater(const VectorV& v_src, + VectorV& v_dest): + v_s_(v_src), + v_d_(v_dest) + { + } + + //debug: (sequential version only) + void operator() (const IndexT& j) + { + v_d_.push_back(v_s_[j]); + } + + ValueT at(IndexT j) const + { + return v_s_[j]; + } + + void update_vals(const VectorV& vals) + { + v_d_ = vals; + } + private: + const VectorV& v_s_; + VectorV& v_d_; + }; + + struct NoValueUpdater + { + void operator() (const IndexT& j) + { + //no-op... + } + + ValueT at(IndexT j) const + { + return ValueT(0); //nothing meaningful... + } + + void update_vals(const VectorV& vals) + { + //no-op... 
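+      // NoValueUpdater deliberately ignores values: it is the policy used by
+      // the topology-only operator() overload below, where the caller asks
+      // for just row_ptr/col_ind of the subgraph and no edge values exist.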
+ } + }; + + virtual void operator () (VectorI& row_ptr_, + VectorI& col_ind_) + { + NoValueUpdater fctr; + sequential_extract_subgraph(row_ptr_, col_ind_, fctr); + } + + virtual void operator () (VectorV& vals_, + VectorI& row_ptr_, + VectorI& col_ind_) + { + ValueUpdater fctr(vals_, vals_subg); + sequential_extract_subgraph(row_ptr_, col_ind_, fctr); + } + +protected: + + //for debugging purposes, only: + // + template + void sequential_extract_subgraph(const VectorI& row_ptr_, + const VectorI& col_ind_, + ValUpdaterFctr& fctr) + { + VectorI all_zeros; + + IndexT last_updated_pos(0); + // + size_t nrows_subg = vertexSubset.size(); + + VectorB hash_rows; + size_t hash_sz = make_hash(vertexSubset, hash_rows);//assume *NOT* sorted + + row_ptr_subg.assign(nrows_subg+1, IndexT(0)); + all_zeros.reserve(nrows_subg); + + IndexT nz_subg(0); + + //this loop assumes sorted vertexSubset + // + for(IndexT i=IndexT(0);i + struct HashFctr + { + explicit HashFctr(Container& hash_src): + m_hash(hash_src) + { + } + IndexT operator() (const IndexT& src_elem) + { + IndexT hit(1); + m_hash[src_elem] = hit; + return hit; + } + private: + Container& m_hash; + }; + + static size_t make_hash(const VectorI& src, + VectorB& hash_src, + bool is_sorted = false) + { + assert( !src.empty() ); + + IndexT max_entry(0); + if( is_sorted ) + max_entry = src.back(); + else + max_entry = *std::max_element(src.begin(), src.end()); + + hash_src.assign(max_entry+1, 0); + VectorB dummy(hash_src); + + HashFctr hctr(hash_src); + + //why unused dummy? + //because functor must return something + //and must store result of functor somewhere! + // + std::transform(src.begin(), src.end(), + dummy.begin(), //unused... + hctr); + + return hash_src.size(); + } + + //re-number vertices: + // + static void remap_indices(const VectorI& src, + VectorI& index_set, + bool is_sorted = false) + { + IndexT max_entry(0); + if( is_sorted ) + max_entry = src.back(); + else + max_entry = *std::max_element(src.begin(), src.end()); + + //use hash_src vector as hash-table: + // + VectorI hash_src(max_entry+1, IndexT(0)); + + IndexT counter(0); + for(typename VectorI::const_iterator pos = src.begin(); + pos != src.end(); + ++pos) + { + hash_src[*pos]=counter++;//SEQUENTIALITY!!! 
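+        // each vertex id in src is thus mapped to its position in src,
+        // i.e. to its new contiguous index in the extracted subgraph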
+ } + + IndexT set_sz(index_set.size()); + VectorI old_index_set(index_set); + + for(IndexT k = IndexT(0);k +#include +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "util.cuh" + +#include +#include +#include +#include +#include + + + +void cluster_inv_test(){ + std::vector cluster = {0, 1, 1, 2, 1, 0, 2, 2, 3, 4, 5, 6, 4, 6, 5, 3}; + int n_vertex = 16; + int c_size = 7; + thrust::device_vector cluster_d(cluster.begin(), cluster.end()); + thrust::device_vector cluster_inv_ptr(c_size + 1); + thrust::device_vector cluster_inv_ind(n_vertex); + int* cluster_inv_ptr_ptr = thrust::raw_pointer_cast(cluster_inv_ptr.data()); + int* cluster_inv_ind_ptr = thrust::raw_pointer_cast(cluster_inv_ind.data()); + thrust::device_vector seq_idx(n_vertex); + thrust::sequence(seq_idx.begin(), seq_idx.end()); + int* seq_idx_ptr = thrust::raw_pointer_cast(seq_idx.data()); + + dim3 block_size((n_vertex + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size(BLOCK_SIZE_1D, 1, 1); + + + nvlouvain::generate_cluster_inv(n_vertex, c_size, cluster_d.begin(), cluster_inv_ptr, cluster_inv_ind); + #ifdef VERBOSE + nvlouvain::display_vec(cluster_inv_ptr); + nvlouvain::display_vec(cluster_inv_ind); + #endif +// nvlouvain::display_vec_size(cluster_inv_ind_ptr, n_vertex); + +} + + +void cluster_sum_test(thrust::device_vector &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + const int n_vertex, + bool weighted){ + + HighResClock hr_clock; + double timed, diff_time; + std::vector cluster(n_vertex); + int c_size; + + if(n_vertex == 16){ + cluster = {0, 1, 1, 2, 1, 0, 2, 2, 3, 4, 5, 6, 4, 6, 5, 3}; + c_size = 7; + } + else{ + for(int i = 0 ; i cluster_d(cluster.begin(), cluster.end()); + thrust::device_vector cluster_inv_ptr(c_size+1); + thrust::device_vector cluster_inv_ind(n_vertex); + int* cluster_inv_ptr_ptr = thrust::raw_pointer_cast(cluster_inv_ptr.data()); + int* cluster_inv_ind_ptr = thrust::raw_pointer_cast(cluster_inv_ind.data()); + thrust::device_vector seq_idx(n_vertex); + thrust::sequence(seq_idx.begin(), seq_idx.end()); + int* seq_idx_ptr = thrust::raw_pointer_cast(seq_idx.data()); + + dim3 block_size((n_vertex + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size(BLOCK_SIZE_1D, 1, 1); + + + + thrust::device_vector score(1); + thrust::device_vector k_vec(n_vertex); + thrust::device_vector Q_arr(n_vertex); + thrust::device_vector delta_Q_arr(csr_ptr_d[n_vertex]); + thrust::device_vector cluster_sum_vec(c_size); + + + T* score_ptr = thrust::raw_pointer_cast(score.data()); + T* k_vec_ptr = thrust::raw_pointer_cast(k_vec.data()); + T* Q_arr_ptr = thrust::raw_pointer_cast(Q_arr.data()); + T* cluster_sum_vec_ptr = thrust::raw_pointer_cast(cluster_sum_vec.data()); + T* delta_Q_arr_ptr = thrust::raw_pointer_cast(delta_Q_arr.data()); + int* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + int* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); + T* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + int* cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); + + + + hr_clock.start(); + nvlouvain::generate_cluster_inv(n_vertex, c_size, cluster_d.begin(), cluster_inv_ptr, cluster_inv_ind); + hr_clock.stop(&timed); + diff_time = timed; + + weighted = true; + + #ifdef VERBOSE + printf("cluster inv: \n"); + nvlouvain::display_vec(cluster_inv_ptr); + nvlouvain::display_vec(cluster_inv_ind); + #endif + std::cout<<"cluster inv rumtime: "<>>(n_vertex, c_size, m2, + 
csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), + cluster_d.begin(), cluster_inv_ptr.begin(), cluster_inv_ind.begin(), + weighted, k_vec_ptr, Q_arr_ptr, delta_Q_arr_ptr,score_ptr); + + CUDA_CALL(cudaDeviceSynchronize()); + + double Q = score[0]; +*/ + hr_clock.stop(&timed); + diff_time = timed; + + #ifdef VERBOSE + printf("Q_arr: \n"); + nvlouvain::display_vec(Q_arr); + printf("k_vec: \n"); + nvlouvain::display_vec(k_vec); + #endif + printf("modularity(w/o block): %.10e runtime: ",Q); + std::cout<>>(n_vertex, + csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), + cluster_d.begin(), delta_Q_arr_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + hr_clock.stop(&timed); + diff_time = timed; + #ifdef VERBOSE + nvlouvain::display_vec(Q_arr); + #endif + std::cout<<"delta (w block) rumtime: "<>>( n_vertex, c_size, + cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + k_vec_ptr, cluster_sum_vec_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + #ifdef VERBOSE + nvlouvain::display_vec(cluster_sum_vec); + #endif + nvlouvain::build_delta_modularity_vec<<>>(n_vertex, c_size, m2 + csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), + cluster_d.begin(), + cluster_sum_vec_ptr, + k_vec_ptr, delta_Q_arr_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + hr_clock.stop(&timed); + diff_time = timed; + #ifdef VERBOSE + nvlouvain::display_vec(Q_arr); + #endif + + std::cout<<"delta (wo block)rumtime: "< +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "util.cuh" + +#include +#include +#include +#include +#include + + +template +__global__ void +kernel_delta_modularity(const int n_vertex, IdxIter csr_ptr_iter, IdxIter csr_ind_iter, ValIter csr_val_iter, IdxIter cluster, ValType* score){ + + int c = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if( i &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + const int size){ + + HighResClock hr_clock; + double timed; + + dim3 block_size((size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, (size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, 1); + dim3 grid_size(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1); + + + thrust::device_vector cluster_d(size); + thrust::sequence(cluster_d.begin(), cluster_d.end()); + std::cout<<"cluster: "; + nvlouvain::display_vec(cluster_d); + + thrust::device_vector score_d(size*size); + T* score_d_raw_ptr = thrust::raw_pointer_cast(score_d.data()); + + + hr_clock.start(); + + kernel_delta_modularity<<>>(size, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), cluster_d.begin(), score_d_raw_ptr); + + + CUDA_CALL(cudaDeviceSynchronize()); + + hr_clock.stop(&timed); + double mod_time(timed); + std::cout<<"delta modularity: "< +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "util.cuh" + +#include +#include +#include +#include +#include + +/* +template< typename IdxType, typename ValType > +__global__ void kernal_test(const int size, IdxType* csr_ptr, ValType* csr_val, int i, ValType* result){ + int idx = blockDim.x*blockIdx.x + threadIdx.x; + if(idx < size){ + nvlouvain::compute_k(size, csr_ptr, csr_val, idx, &result[idx]); + //printf("k%d = %f\n", idx ,result[idx]); + + } + return; + +} + +template< typename IdxIter, typename ValIter, typename ValType > +__global__ void kernal_test_iter(const int size, IdxIter csr_ptr_iter, ValIter csr_val_iter, int i, ValType* result){ + + int idx = blockDim.x*blockIdx.x + 
threadIdx.x; + if(idx < size){ + + //printf("start compute k with iter passing. (%d, %d, %d) idx = %d %f\n", blockDim.x, blockIdx.x, threadIdx.x, idx, result[idx]); + nvlouvain::compute_k(size, csr_ptr_iter, csr_val_iter, idx, &result[idx]); + + //printf("k%d = %f\n", idx ,result[idx]); + + } + return; + +} + + +template< typename IdxIter, typename ValIter, typename DevPtr > +__global__ void kernal_test_dev_ptr(const int size, IdxIter csr_ptr_iter, ValIter csr_val_iter, int i, DevPtr result){ + + int idx = blockDim.x*blockIdx.x + threadIdx.x; + if(idx < size){ + //printf("start compute k with iter passing. (%d, %d, %d) idx = %d %f\n", blockDim.x, blockIdx.x, threadIdx.x, idx, result[idx]); + nvlouvain::compute_k(size, csr_ptr_iter, csr_val_iter, idx, &result[idx]); + //printf("k%d = %f\n", idx ,result[idx]); + } + return; + +} + + + +void k_compute_test( thrust::device_vector &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + int size){ + + HighResClock hr_clock; + double timed; + + + dim3 block_size((size + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size(BLOCK_SIZE_1D, 1, 1); + + + std::cout<< csr_ptr_d.size()<<" "< k_d(size); + T* k_d_raw_cast_ptr = thrust::raw_pointer_cast(k_d.data()); + + hr_clock.start(); + kernal_test<<>>(size , csr_ptr_d_raw_ptr, csr_val_d_raw_ptr, 0, k_d_raw_cast_ptr); + CUDA_CALL(cudaDeviceSynchronize()); +// nvlouvain::display_vec(k_d); + hr_clock.stop(&timed); + double raw_ptr_time(timed); + + + + thrust::device_vector k_iter_d(size); + T* k_iter_d_raw_ptr = thrust::raw_pointer_cast(k_iter_d.data()); + hr_clock.start(); + kernal_test_iter<<>>(size, csr_ptr_d.begin(), csr_val_d.begin(), 0, k_iter_d_raw_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + hr_clock.stop(&timed); + double iter_time(timed); +// nvlouvain::display_vec(k_iter_d); + + + thrust::device_vector k_d_ptr_d(size); + hr_clock.start(); + kernal_test_dev_ptr<<>>(size, csr_ptr_d.begin(), csr_val_d.begin(), 0, k_d_ptr_d.data()); + CUDA_CALL(cudaDeviceSynchronize()); + hr_clock.stop(&timed); + double dev_ptr_time(timed); +// nvlouvain::display_vec(k_d_ptr_d); + + + + + std::cout<<"raw_ptr_runtime: "< +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "util.cuh" + +#include +#include +#include +#include +#include + +template< typename IdxIter, typename ValIter, typename ValType > +__global__ void kernal_k_in_test(const int size, IdxIter csr_ptr_iter, IdxIter csr_ind_iter, ValIter csr_val_iter, IdxIter cluster_iter, int i, ValType* result){ +/* + + //printf("successfully launch kernal\n"); + + int idx_x = blockDim.x*blockIdx.x + threadIdx.x; + int idx_y = blockDim.y*blockIdx.y + threadIdx.y; + + if(idx_x < size && idx_y < size ){ + + int c = *( cluster_iter + idx_y); + //printf(" ** %d %d\n", idx_x, idx_y); + //printf("start compute k with iter passing. 
(%d, %d, %d) idx = %d %f\n", blockDim.x, blockIdx.x, threadIdx.x, idx, result[idx]); + nvlouvain::compute_k_i_in(size, csr_ptr_iter, csr_ind_iter, csr_val_iter, cluster_iter, c, idx_x, &result[idx_x *size + idx_y ]); + // n_vertex, csr_ptr_iter, csr_idx_iter, csr_val_iter, cluster_iter, c, i, result + printf("k_%d_in_c%d = %f\n", idx_x, idx_y ,result[idx_x *size + idx_y]); + + } +*/ +/* + if(idx == 0){ + nvlouvain::display_vec(csr_ptr_iter, size); + nvlouvain::display_vec(csr_ind_iter, csr_ptr_iter[size]); + nvlouvain::display_vec(csr_val_iter, csr_ptr_iter[size]); + + } +*/ + return; + +} + + +void k_i_in_compute_test( thrust::device_vector &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + int size){ + + HighResClock hr_clock; + double timed; + + + dim3 block_size((size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, (size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, 1); + dim3 grid_size(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1); + + std::cout<< csr_ptr_d.size()<<" "< result_d(size * size); + thrust::device_vector cluster_d(size); + + T* result_ptr = thrust::raw_pointer_cast(result_d.data()); + + + hr_clock.start(); + int i = 0; + std::cout<<"successfully declair device vector.\n"; + kernal_k_in_test<<>>(size, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), cluster_d.begin(), i, result_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + + hr_clock.stop(&timed); + double iter_time(timed); + nvlouvain::display_vec(result_d); + + std::cout<<"k_i_in runtime: "< &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + int size){ + for_each_n() +}*/ diff --git a/cpp/nvgraph/cpp/include/test/mem_test.cuh b/cpp/nvgraph/cpp/include/test/mem_test.cuh new file mode 100644 index 00000000000..e6717024093 --- /dev/null +++ b/cpp/nvgraph/cpp/include/test/mem_test.cuh @@ -0,0 +1,98 @@ + +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "util.cuh" + +#include +#include +#include +#include +#include +#include + + + +template +__global__ void +kernel_local_mem(const int n_vertex ){ + + thrust::device_system_tag device_sys; + thrust::pointer temp_i = thrust::malloc(device_sys, n_vertex); // for weight on i and for sum_k + thrust::pointer temp_idx = thrust::malloc(device_sys, n_vertex); // for weight on i and for sum_k + + + + *temp_i = 10.0; + *(temp_i + n_vertex-1) = 100.5; + + thrust::return_temporary_buffer(device_sys, temp_idx); + thrust::return_temporary_buffer(device_sys, temp_i); +} + +template +__global__ void +kernel_local_mem_new(const int n_vertex ){ + + ValType * temp_i = new ValType[n_vertex]; + IdxType * temp_idx = new IdxType[n_vertex]; + + + *temp_i = 10.0; + *(temp_i + n_vertex-1) = 100.5; + thrust::sequence(thrust::cuda::par, temp_idx, temp_idx + n_vertex); + printf("%d %d %d ... 
%d\n",*temp_idx, *(temp_idx+1), *(temp_idx+2), *(temp_idx + n_vertex - 1) ); + + delete [] temp_i; + delete [] temp_idx; +} + + + + +void mem_allocate_test(const int size){ + + + HighResClock hr_clock; + double timed; + + + dim3 block_size((size + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size(BLOCK_SIZE_1D, 1, 1); + hr_clock.start(); + + kernel_local_mem<<>>(30000); + + kernel_local_mem_new<<>>(30000); + + + CUDA_CALL(cudaDeviceSynchronize()); + hr_clock.stop(&timed); + double raw_ptr_time(timed); + + std::cout<<"allocate_mem_runtime: "< +#include +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "util.cuh" + +#include +#include +#include +#include +#include + + +void modularity_test_no_matrix(thrust::device_vector &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + const int size, + const bool weighted){ + + + HighResClock hr_clock; + double timed; + + + + dim3 block_size((size + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size(BLOCK_SIZE_1D, 1, 1); + + + std::cout<<"n_vertex: "< cluster; + + + thrust::device_vector cluster_d(size); +// thrust::sequence(cluster_d.begin(), cluster_d.end()); + +// std::cout<<"cluster: "; + //nvlouvain::display_vec(cluster_d); + + thrust::device_vector score(1); + thrust::device_vector k_vec(size); + thrust::device_vector Q_arr(size); + thrust::device_vector temp_i(csr_ptr_d[size]); + thrust::device_vector cluster_inv_ptr(size+1); + thrust::device_vector cluster_inv_ind(size); + thrust::sequence(thrust::cuda::par, cluster_inv_ptr.begin(), cluster_inv_ptr.end()); + thrust::sequence(thrust::cuda::par, cluster_inv_ind.begin(), cluster_inv_ind.end()); + thrust::fill(thrust::device, temp_i.begin(), temp_i.end(), 0.0); + +// nvlouvain::display_vec(temp_i); + + T* score_ptr = thrust::raw_pointer_cast(score.data()); + T* k_vec_ptr = thrust::raw_pointer_cast(k_vec.data()); + T* Q_arr_ptr = thrust::raw_pointer_cast(Q_arr.data()); + T* temp_i_ptr = thrust::raw_pointer_cast(temp_i.data()); + int* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + int* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); + T* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + int* cluster_inv_ptr_ptr = thrust::raw_pointer_cast(cluster_inv_ptr.data()); + int* cluster_inv_ind_ptr = thrust::raw_pointer_cast(cluster_inv_ind.data()); + int* cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); + + + + hr_clock.start(); + + T m2 = thrust::reduce(thrust::cuda::par, csr_val_d.begin(), csr_val_d.end()); + nvlouvain::generate_cluster_inv(size, size, cluster_d.begin(), cluster_inv_ptr, cluster_inv_ind); + + double Q = nvlouvain::modularity(size, csr_ptr_d[size], size, m2, + csr_ptr_ptr, csr_ind_ptr, csr_val_ptr, + cluster_ptr, cluster_inv_ptr_ptr, cluster_inv_ind_ptr, + weighted, k_vec_ptr, Q_arr_ptr, temp_i_ptr); +/* + nvlouvain::kernel_modularity_no_matrix<<>>(size, size, m2, + csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), + cluster_d.begin(), cluster_inv_ptr.begin(), cluster_inv_ind.begin(), + weighted, k_vec_ptr, Q_arr_ptr, temp_i_ptr, score_ptr); + + + CUDA_CALL(cudaDeviceSynchronize()); + double Q = score[0]; +*/ + hr_clock.stop(&timed); + double mod_time(timed); + printf("modularity(w/o block): %.10e runtime: ",Q); + std::cout< &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + const int size, + const bool weighted){ + + HighResClock hr_clock; + double timed; + + dim3 
block_size((size + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size(BLOCK_SIZE_1D, 1, 1); + + std::cout<<"n_vertex: "< cluster_d(size); + thrust::sequence(cluster_d.begin(), cluster_d.end()); + //std::cout<<"cluster: "; + //nvlouvain::display_vec(cluster_d); + + thrust::device_vector score(1); + thrust::device_vector k_vec(size); + thrust::device_vector Q_arr(size); + + T* score_ptr = thrust::raw_pointer_cast(score.data()); + T* k_vec_ptr = thrust::raw_pointer_cast(k_vec.data()); + T* Q_arr_ptr = thrust::raw_pointer_cast(Q_arr.data()); + + int n_edges = csr_ptr_d[size]; + T m2 = thrust::reduce(thrust::cuda::par, csr_val_d.begin(), csr_val_d.end()+ n_edges); + + hr_clock.start(); + + + nvlouvain::kernel_modularity_no_matrix_block<<>>(size, m2, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), + cluster_d.begin(), + weighted, k_vec_ptr, Q_arr_ptr); + + CUDA_CALL(cudaDeviceSynchronize()); + + hr_clock.stop(&timed); + double mod_time(timed); + double Q = thrust::reduce(thrust::cuda::par, Q_arr_ptr, Q_arr_ptr + size, (0.0)); + + printf("modularity(w/ block): %.10e runtime: ",Q); + std::cout< csr_ptr_d; + thrust::device_vector csr_ind_d, + thrust::device_vector csr_val_d; + const int size; + bool weighted = truel + dim3 block_size((size + BLOCK_SIZE_1D -1)/ BLOCK_SIZE_1D, 1, 1); + dim3 grid_size(BLOCK_SIZE_1D, 1, 1); + + + std::cout<<"n_vertex: "< cluster; + + + thrust::device_vector cluster_d(size); +// thrust::sequence(cluster_d.begin(), cluster_d.end()); + +// std::cout<<"cluster: "; + //nvlouvain::display_vec(cluster_d); + + thrust::device_vector score(1); + thrust::device_vector k_vec(size); + thrust::device_vector Q_arr(size); + thrust::device_vector temp_i(csr_ptr_d[size]); + thrust::device_vector cluster_inv_ptr(size+1); + thrust::device_vector cluster_inv_ind(size); + thrust::sequence(thrust::cuda::par, cluster_inv_ptr.begin(), cluster_inv_ptr.end()); + thrust::sequence(thrust::cuda::par, cluster_inv_ind.begin(), cluster_inv_ind.end()); + thrust::fill(thrust::device, temp_i.begin(), temp_i.end(), 0.0); + +// nvlouvain::display_vec(temp_i); + + T* score_ptr = thrust::raw_pointer_cast(score.data()); + T* k_vec_ptr = thrust::raw_pointer_cast(k_vec.data()); + T* Q_arr_ptr = thrust::raw_pointer_cast(Q_arr.data()); + T* temp_i_ptr = thrust::raw_pointer_cast(temp_i.data()); + + hr_clock.start(); + + T m2 = thrust::reduce(thrust::cuda::par, csr_val_d.begin(), csr_val_d.end()); + nvlouvain::generate_cluster_inv(size, c_size, cluster_d.begin(), cluster_inv_ptr, cluster_inv_ind); + + double Q = nvlouvain::modularity(size, size, m2, + csr_ptr_d, csr_ind_d, csr_val_d, + cluster_d, cluster_inv_ptr, cluster_inv_ind, + weighted, k_vec_ptr, Q_arr_ptr, temp_i_ptr); + nvlouvain::kernel_modularity_no_matrix<<>>(size, size, m2, + csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), + cluster_d.begin(), cluster_inv_ptr.begin(), cluster_inv_ind.begin(), + weighted, k_vec_ptr, Q_arr_ptr, temp_i_ptr, score_ptr); + + + CUDA_CALL(cudaDeviceSynchronize()); + double Q = score[0]; + hr_clock.stop(&timed); + double mod_time(timed); + printf("modularity(w/o block): %.10e runtime: ",Q); + std::cout< +__global__ void +kernel_phase_1_color(const int n_vertex, IdxIter csr_ptr_iter, IdxIter csr_ind_iter, ValIter csr_val_iter, IdxIter cluster, + IdxType* color, IdxType color_size,ValType *matrix, IdxType *cluster_sizes, ValType* improve, IdxType* n_moved){ + + + + *n_moved = 0; + IdxType j = blockIdx.x * blockDim.x + threadIdx.x; + IdxType i = blockIdx.y * blockDim.y + threadIdx.y; + + for( int t = 0; t 
< color_size; ++t ){ // color t + if( i< n_vertex && color[i] == t ){ + IdxType start_idx = *(csr_ptr_iter + i); + IdxType end_idx = *(csr_ptr_iter + i + 1); + if(j < end_idx - start_idx){ + IdxType c = cluster[ csr_ind_iter[start_idx + j]]; + //printf("i:%d j:%d start:%d end:%d c:%d\n",i,j,start_idx, end_idx,c); + nvlouvain::phase_1( n_vertex, + csr_ptr_iter, + csr_ind_iter, + csr_val_iter, + cluster, + i, + j, + c, + matrix, + cluster_sizes, + improve, n_moved); + } + + } + } +} + +/* +void phase_1_color_test(thrust::device_vector &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + const int size){ + + HighResClock hr_clock; + double timed; + + dim3 block_size((size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, (size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, 1); + dim3 grid_size(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1); + + + + thrust::device_vector cluster_d(size); + thrust::sequence(cluster_d.begin(), cluster_d.end()); + + std::cout<<"old cluster: "; + nvlouvain::display_vec(cluster_d); + + thrust::device_vector Q_d(1); + T* Q_d_raw_ptr = thrust::raw_pointer_cast(Q_d.data()); + + thrust::device_vector matrix(size*size); + T* matrix_raw_ptr = thrust::raw_pointer_cast(matrix.data()); + + hr_clock.start(); + + kernel_modularity<<>>(size, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), cluster_d.begin(), matrix_raw_ptr, Q_d_raw_ptr); + + + CUDA_CALL(cudaDeviceSynchronize()); + + hr_clock.stop(&timed); + double mod_time(timed); + std::cout<<"modularity: "< improve_d(1); + T* improve_d_raw_ptr = thrust::raw_pointer_cast(improve_d.data()); + + thrust::device_vector c_size_d(size, 1); + int* c_size_d_raw_ptr = thrust::raw_pointer_cast(c_size_d.data()); + + thrust::device_vector n_moved(1, 0); + int* n_moved_ptr = thrust::raw_pointer_cast(n_moved.data()); +// nvlouvain::display_vec(c_size_d); + + //-------------------------------- 1st - + thrust::device_vector Q_old(Q_d); + double delta_Q; + + int count = 0; + int num_move = 0; + int color_size; + + std::vector fill_color(size); + if(size == 16){ + fill_color = {0, 0, 1, 2, 2, 2, 0, 1, 2, 0, 0, 1, 1, 2, 1, 0}; + color_size = 3; + } + else if(size == 4){ + fill_color = {0, 1, 2, 0}; + color_size = 3; + } + + + thrust::device_vector color(fill_color); + + int* color_ptr = thrust::raw_pointer_cast(color.data()); + + do{ + Q_old[0] = Q_d[0]; + hr_clock.start(); + + kernel_phase_1_color<<>>(size, + csr_ptr_d.begin(), + csr_ind_d.begin(), + csr_val_d.begin(), + cluster_d.begin(), + color_ptr, + color_size, + matrix_raw_ptr, + c_size_d_raw_ptr, + improve_d_raw_ptr, + n_moved_ptr); + + CUDA_CALL(cudaDeviceSynchronize()); + + hr_clock.stop(&timed); + mod_time = timed; + std::cout<<"new cluster: "; + nvlouvain::display_vec(cluster_d); + std::cout<<"improvement: "<>>(size, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), cluster_d.begin(), matrix_raw_ptr, Q_d_raw_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + + + delta_Q = Q_d[0] - Q_old[0]; + std::cout<<"new modularity: "< 0 ); + +} +*/ + diff --git a/cpp/nvgraph/cpp/include/test/phase_1_test.cuh b/cpp/nvgraph/cpp/include/test/phase_1_test.cuh new file mode 100644 index 00000000000..68aac0077e9 --- /dev/null +++ b/cpp/nvgraph/cpp/include/test/phase_1_test.cuh @@ -0,0 +1,158 @@ + + +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "util.cuh" + +#include +#include +#include +#include +#include + + +template +__global__ void +kernel_phase_1(const int n_vertex, IdxIter csr_ptr_iter, IdxIter csr_ind_iter, ValIter csr_val_iter, IdxIter cluster, + ValType *matrix, IdxType *cluster_sizes, ValType* improve, IdxType* n_moved){ + + + + *n_moved = 0; + IdxType j = blockIdx.x * blockDim.x + threadIdx.x; + IdxType i = blockIdx.y * blockDim.y + threadIdx.y; + + if( i< n_vertex && j &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + const int size){ +/* + HighResClock hr_clock; + double timed; + + dim3 block_size((size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, (size + BLOCK_SIZE_2D -1)/ BLOCK_SIZE_2D, 1); + dim3 grid_size(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1); + + + + thrust::device_vector cluster_d(size); + thrust::sequence(cluster_d.begin(), cluster_d.end()); + + std::cout<<"old cluster: "; + //nvlouvain::display_vec(cluster_d); + + thrust::device_vector Q_d(1); + T* Q_d_raw_ptr = thrust::raw_pointer_cast(Q_d.data()); + + thrust::device_vector matrix(size*size); + T* matrix_raw_ptr = thrust::raw_pointer_cast(matrix.data()); + + hr_clock.start(); + + kernel_modularity<<>>(size, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), cluster_d.begin(), matrix_raw_ptr, Q_d_raw_ptr); + + + CUDA_CALL(cudaDeviceSynchronize()); + + hr_clock.stop(&timed); + double mod_time(timed); + std::cout<<"modularity: "< improve_d(1); + T* improve_d_raw_ptr = thrust::raw_pointer_cast(improve_d.data()); + + thrust::device_vector c_size_d(size, 1); + int* c_size_d_raw_ptr = thrust::raw_pointer_cast(c_size_d.data()); + + thrust::device_vector n_moved(1, 0); + int* n_moved_ptr = thrust::raw_pointer_cast(n_moved.data()); + + //-------------------------------- 1st - + thrust::device_vector Q_old(Q_d); + double delta_Q; + + int count = 0; + int num_move = 0; + do{ + Q_old[0] = Q_d[0]; + hr_clock.start(); + + kernel_phase_1<<>>(size, + csr_ptr_d.begin(), + csr_ind_d.begin(), + csr_val_d.begin(), + cluster_d.begin(), + matrix_raw_ptr, + c_size_d_raw_ptr, + improve_d_raw_ptr, + n_moved_ptr); + + CUDA_CALL(cudaDeviceSynchronize()); + + hr_clock.stop(&timed); + mod_time = timed; + std::cout<<"new cluster: "; + //nvlouvain::display_vec(cluster_d); + std::cout<<"improvement: "<>>(size, csr_ptr_d.begin(), csr_ind_d.begin(), csr_val_d.begin(), cluster_d.begin(), matrix_raw_ptr, Q_d_raw_ptr); + CUDA_CALL(cudaDeviceSynchronize()); + + + delta_Q = Q_d[0] - Q_old[0]; + std::cout<<"new modularity: "< 0 && count <5); +*/ +} + + + diff --git a/cpp/nvgraph/cpp/include/test/thrust_test.cuh b/cpp/nvgraph/cpp/include/test/thrust_test.cuh new file mode 100644 index 00000000000..aaa6b32e889 --- /dev/null +++ b/cpp/nvgraph/cpp/include/test/thrust_test.cuh @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include "test_opt_utils.h" +#include "graph_utils.cuh" +#include "louvain.cuh" +#include "gtest/gtest.h" +#include "high_res_clock.h" + +#include +#include +#include +#include +#include + + + + +template +__global__ void test_sum(iter begin, iter end, ptr sum){ + + thrust::plus op; + *sum = thrust::reduce(thrust::cuda::par, begin, end, 0.0, op); + +} + +__global__ void test_sum_cast(T* vec, size_t size, T* sum){ + + thrust::plus op; + *sum = thrust::reduce(thrust::cuda::par, vec, vec+size, 0.0, op); + +} + + +void thrust_passing_arg_test( thrust::host_vector &csr_ptr_h, + thrust::host_vector &csr_ind_h, + thrust::host_vector &csr_val_h, + thrust::device_vector &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d){ + + HighResClock hr_clock; + double timed; + + thrust::plus binary_op; + hr_clock.start(); + T sum_h = thrust::reduce(csr_val_h.begin(), csr_val_h.end(), 0.0, binary_op); + hr_clock.stop(&timed); + double cpu_time(timed); + + + + thrust::copy(csr_val_d.begin(), csr_val_d.end(), std::ostream_iterator(std::cout, " ")); + std::cout< sum_d(1, 0.0); + test_sum<<>>( csr_val_d.begin(),csr_val_d.end(), sum_d.data()); + CUDA_CALL(cudaDeviceSynchronize()); + hr_clock.stop(&timed); + double cuda_time(timed); + + + hr_clock.start(); + cudaStream_t s; + thrust::device_vector sum_a(1, 0.0); + cudaStreamCreate(&s); + test_sum<<<1,1,0,s>>>(csr_val_d.begin(),csr_val_d.end(), sum_a.data()); + cudaStreamSynchronize(s); + hr_clock.stop(&timed); + double asyn_time(timed); + + + + hr_clock.start(); + T* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data()); + double* raw_sum; + double sum_cast; + cudaMalloc((void **) &raw_sum, sizeof(double)); + test_sum_cast<<>>( csr_val_ptr, csr_val_d.size(), raw_sum); + cudaMemcpy(&sum_cast, raw_sum, sizeof(double),cudaMemcpyDeviceToHost); + CUDA_CALL(cudaDeviceSynchronize()); + hr_clock.stop(&timed); + double cast_time(timed); + cudaFree(raw_sum); + + + + + std::cout<<"cpu sum of val: "<< sum_h <<" runtime: "< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +extern "C" { +#include "mmio.h" +} +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define CUDACHECK(cudaCall) \ + do { \ + cudaError_t e = (cudaCall); \ + if(e != cudaSuccess) { \ + fprintf(stderr, "CUDA Error (%s:%d): %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + } \ + } while(0) + + +std::string getFileName(const std::string& s) { + + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = s.rfind(sep, s.length()); + if (i != std::string::npos) { + return(s.substr(i+1, s.length() - i)); + } + + return(""); +} + +template +void verbose_diff(std::vector & v1, std::vector & v2) { + for (unsigned int i = 0; i < v1.size(); ++i) + { + if (v1[i] != v2[i]) + { + std::cout << "[" << i <<"] : " << v1[i] << " -- ref = "<< v2[i]< +int eq(std::vector & v1, std::vector & v2) { + if (v1 == v2) + return 0; + else { + verbose_diff(v1,v2); + return 1; + } +} + +template +void printv(size_t n, T* vec, int 
offset) { + thrust::device_ptr dev_ptr(vec); + std::cout.precision(15); + std::cout << "sample size = "<< n << ", offset = "<< offset << std::endl; + thrust::copy(dev_ptr+offset,dev_ptr+offset+n, std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; +} + +template +void ref_csr2csc (int m, int n, int nnz, const T_ELEM *csrVals, const int *csrRowptr, const int *csrColInd, T_ELEM *cscVals, int *cscRowind, int *cscColptr, int base=0){ + int i,j, row, col, index; + int * counters; + T_ELEM val; + + /* early return */ + if ((m <= 0) || (n <= 0) || (nnz <= 0)){ + return; + } + + /* build compressed column pointers */ + memset(cscColptr, 0, (n+1)*sizeof(cscColptr[0])); + cscColptr[0]=base; + for (i=0; i +int transition_matrix_cpu(int n, int e, int *csrRowPtrA, int *csrColIndA, T *weight, T* is_leaf) +//omp_set_num_threads(4); +//#pragma omp parallel + { + int j,row, row_size; + //#pragma omp for + for (row=0; row +int mm_properties(FILE * f, int tg, MM_typecode * t, + IndexType_ * m, IndexType_ * n, + IndexType_ * nnz) { + + // Read matrix properties from file + int mint, nint, nnzint; + if(fseek(f,0,SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if(mm_read_banner(f,t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if(!mm_is_matrix(*t) || !mm_is_coordinate(*t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if(mm_read_mtx_crd_size(f,&mint,&nint,&nnzint)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if(!mm_is_pattern(*t) && !mm_is_real(*t) && + !mm_is_integer(*t) && !mm_is_complex(*t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + *m = mint; + *n = nint; + *nnz = nnzint; + + // Find total number of non-zero entries + if(tg && !mm_is_general(*t)) { + + // Non-diagonal entries should be counted twice + IndexType_ nnzOld = *nnz; + *nnz *= 2; + + // Diagonal entries should not be double-counted + int i; int st; + for(i=0; i +int mm_to_coo(FILE *f, int tg, IndexType_ nnz, + IndexType_ * cooRowInd, IndexType_ * cooColInd, + ValueType_ * cooRVal , ValueType_ * cooIVal) { + + // Read matrix properties from file + MM_typecode t; + int m, n, nnzOld; + if(fseek(f,0,SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if(mm_read_banner(f,&t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if(!mm_is_matrix(t) || !mm_is_coordinate(t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if(mm_read_mtx_crd_size(f,&m,&n,&nnzOld)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if(!mm_is_pattern(t) && !mm_is_real(t) && + !mm_is_integer(t) && !mm_is_complex(t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + + // Add each matrix entry in file to COO format matrix + IndexType_ i; // Entry index in Matrix Market file + IndexType_ j = 0; // Entry index in COO format matrix + for(i=0;i + __host__ __device__ + bool operator()(const Tuple1 t1, const Tuple2 t2) { + switch(i) { + case 0: return (thrust::get<0>(t1) < thrust::get<0>(t2)); + case 1: return (thrust::get<1>(t1) < thrust::get<1>(t2)); + default: return (thrust::get<0>(t1) < thrust::get<0>(t2)); + } + + } +}; + +/// Sort entries in COO format matrix +/** Sort is stable. 
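+ *
+ * Because the sort is stable, calling coo_sort first with sort_by_row == 0
+ * (sort by column index) and then with sort_by_row == 1 (sort by row index)
+ * leaves the entries in row-major (row, column) order; coo_to_csr below
+ * relies on exactly this two-pass pattern before compressing the row indices.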
+ * + * @param nnz Number of non-zero matrix entries. + * @param sort_by_row Boolean indicating whether matrix entries + * will be sorted by row index or by column index. + * @param cooRowInd Row indices for COO matrix. + * @param cooColInd Column indices for COO matrix. + * @param cooRVal Real component for COO matrix entries. Ignored if + * null pointer. + * @param cooIVal Imaginary component COO matrix entries. Ignored if + * null pointer. + */ +template +void coo_sort(IndexType_ nnz, int sort_by_row, + IndexType_ * cooRowInd, + IndexType_ * cooColInd, + ValueType_ * cooRVal, + ValueType_ * cooIVal) { + + // Determine whether to sort by row or by column + int i; + if(sort_by_row == 0) + i = 1; + else + i = 0; + + // Apply stable sort + using namespace thrust; + if((cooRVal==NULL) && (cooIVal==NULL)) + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz)), + lesser_tuple(i)); + else if((cooRVal==NULL) && (cooIVal!=NULL)) + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooIVal)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooIVal+nnz)), + lesser_tuple(i)); + else if((cooRVal!=NULL) && (cooIVal==NULL)) + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooRVal+nnz)), + lesser_tuple(i)); + else + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal,cooIVal)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz, + cooRVal+nnz,cooIVal+nnz)), + lesser_tuple(i)); +} + +/// Compress sorted list of indices +/** For use in converting COO format matrix to CSR or CSC format. + * + * @param n Maximum index. + * @param nnz Number of non-zero matrix entries. + * @param sortedIndices Sorted list of indices (COO format). + * @param compressedIndices (Output) Compressed list of indices (CSR + * or CSC format). Should have at least n+1 entries. + */ +template +void coo_compress(IndexType_ m, IndexType_ n, IndexType_ nnz, + const IndexType_ * __restrict__ sortedIndices, + IndexType_ * __restrict__ compressedIndices) { + IndexType_ i; + + // Initialize everything to zero + memset(compressedIndices, 0, (m+1)*sizeof(IndexType_)); + + // Count number of elements per row + for(i=0; i +int coo_to_csr(IndexType_ m, IndexType_ n, IndexType_ nnz, + IndexType_ * __restrict__ cooRowInd, + IndexType_ * __restrict__ cooColInd, + ValueType_ * __restrict__ cooRVal, + ValueType_ * __restrict__ cooIVal, + IndexType_ * __restrict__ csrRowPtr, + IndexType_ * __restrict__ csrColInd, + ValueType_ * __restrict__ csrRVal, + ValueType_ * __restrict__ csrIVal) { + + // Convert COO to CSR matrix + coo_sort(nnz, 0, cooRowInd, cooColInd, cooRVal, cooIVal); + coo_sort(nnz, 1, cooRowInd, cooColInd, cooRVal, cooIVal); + coo_compress(m, n, nnz, cooRowInd, csrRowPtr); + + // Copy arrays + if(csrColInd!=NULL) + memcpy(csrColInd, cooColInd, nnz*sizeof(IndexType_)); + if((cooRVal!=NULL) && (csrRVal!=NULL)) + memcpy(csrRVal, cooRVal, nnz*sizeof(ValueType_)); + if((cooIVal!=NULL) && (csrIVal!=NULL)) + memcpy(csrIVal, cooIVal, nnz*sizeof(ValueType_)); + + return 0; + +} + diff --git a/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh b/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh new file mode 100644 index 00000000000..7faec5ee85d --- /dev/null +++ b/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include "util.cuh" +#include "graph_utils.cuh" +//#include //indices_to_offsets + +template +void indices_to_offsets(const thrust::execution_policy &exec, + const IndexArray& indices, OffsetArray& offsets) +{ + typedef typename OffsetArray::value_type OffsetType; + + // convert uncompressed row indices into compressed row offsets + thrust::lower_bound(exec, + indices.begin(), + indices.end(), + thrust::counting_iterator(0), + thrust::counting_iterator(offsets.size()), + offsets.begin()); +} + + +template +void counting_sort_by_key(const thrust::execution_policy &exec, + ArrayType1& keys, ArrayType2& vals//, + /*typename ArrayType1::value_type min, typename ArrayType1::value_type max*/) +{ +/* + std::cout<<"## stable_sort_by_key\n" ; + if(keys.size()!= vals.size()){ + std::cout<<"Error keys.size()!= vals.size()\n" ; + } +*/ + CUDA_CALL(cudaDeviceSynchronize()); + thrust::stable_sort_by_key(exec, keys.begin(), keys.end(), vals.begin()); + CUDA_CALL(cudaDeviceSynchronize()); +// std::cout<<"## done stable_sort_by_key\n"; +} + + +template +void sort_by_row_and_column(const thrust::execution_policy &exec, + ArrayType1& row_indices, ArrayType2& column_indices, ArrayType3& values, + typename ArrayType1::value_type min_row = 0, + typename ArrayType1::value_type max_row = 0, + typename ArrayType2::value_type min_col = 0, + typename ArrayType2::value_type max_col = 0) +{ + typedef typename ArrayType1::value_type IndexType1; + typedef typename ArrayType2::value_type IndexType2; + typedef typename ArrayType3::value_type ValueType; + + size_t N = row_indices.size(); + + + thrust::detail::temporary_array permutation(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), N); + thrust::sequence(exec, permutation.begin(), permutation.end()); + +/* + IndexType1 minr = min_row; + IndexType1 maxr = max_row; + IndexType2 minc = min_col; + IndexType2 maxc = max_col; +*/ + //std::cout<<"## max element\n"; + +/* + if(maxr == 0){ +// maxr = *thrust::max_element(exec, row_indices.begin(), row_indices.end()); + ArrayType1::iterator maxr_iter = thrust::max_element(exec, row_indices.begin(), row_indices.end()); + maxr = *maxr_ptr; + } + if(maxc == 0){ +// maxc = *thrust::max_element(exec, column_indices.begin(), column_indices.end()); + ArrayType2::iterator maxc_iter = thrust::max_element(exec, column_indices.begin(), column_indices.end()); + thrust::copy() + maxc = *maxc_ptr; + } +*/ +// std::cout<<"## compute permutation and sort by (I,J)\n"; + // compute permutation and sort by (I,J) + { + thrust::detail::temporary_array temp(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + column_indices.begin(), column_indices.end()); + counting_sort_by_key(exec, temp, permutation/*, minc, maxc*/); + + thrust::copy(exec, row_indices.begin(), row_indices.end(), temp.begin()); + + thrust::gather(exec, permutation.begin(), permutation.end(), temp.begin(), 
row_indices.begin()); + counting_sort_by_key(exec, row_indices, permutation/*, minr, maxr*/); +// thrust::stable_sort_by_key(exec, row_indices.begin(), row_indices.end(), permutation.begin()); + + thrust::copy(exec, column_indices.begin(), column_indices.end(), temp.begin()); + thrust::gather(exec, permutation.begin(), permutation.end(), temp.begin(), column_indices.begin()); + + } + // use permutation to reorder the values + { + thrust::detail::temporary_array temp(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), + values.begin(), values.end()); + thrust::gather(exec, permutation.begin(), permutation.end(), temp.begin(), values.begin()); + } +} + +//#include +// -------------------- +// Kernels +// -------------------- + +// Kernel to store aggregate I of each fine point index i +template +__global__ +void iToIKernel(const IndexType *row_offsets, const IndexType *aggregates, IndexType *I, const int num_rows) +{ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < num_rows; tid += gridDim.x * blockDim.x) + { + int agg = aggregates[tid]; + for (int j=row_offsets[tid];j +__global__ +void jToJKernel(const IndexType *column_indices, const IndexType *aggregates, IndexType *J, const int num_entries) +{ + for (int tid = blockDim.x*blockIdx.x + threadIdx.x; tid < num_entries; tid += gridDim.x * blockDim.x) + { + int j = column_indices[tid]; + J[tid] = aggregates[j]; + } +} + +//----------------------------------------------------- +// Method to compute the Galerkin product: A_c=R*A*P +//----------------------------------------------------- + +// Method to compute Ac on DEVICE using csr format +template +void generate_superverticies_graph(const int n_vertex, const int num_aggregates, + thrust::device_vector &csr_ptr_d, + thrust::device_vector &csr_ind_d, + thrust::device_vector &csr_val_d, + thrust::device_vector &new_csr_ptr_d, + thrust::device_vector &new_csr_ind_d, + thrust::device_vector &new_csr_val_d, + const thrust::device_vector &aggregates + ){ + + const int n_edges = csr_ptr_d[n_vertex]; + + + thrust::device_vector I(n_edges,-1); + thrust::device_vector J(n_edges,-1); + thrust::device_vector V(n_edges,-1); + + const int block_size_I = 128; + const int block_size_J = 256; + + const int num_blocks_I = min( GRID_MAX_SIZE, (int) ((n_vertex-1)/block_size_I + 1) ); + const int num_blocks_J = min( GRID_MAX_SIZE, (int) ((n_edges-1)/block_size_J + 1) ); + + const IndexType *row_offsets_ptr = thrust::raw_pointer_cast(csr_ptr_d.data()); + const IndexType *column_indices_ptr = thrust::raw_pointer_cast(csr_ind_d.data()); + const IndexType *aggregates_ptr= thrust::raw_pointer_cast(aggregates.data()); + IndexType *I_ptr= thrust::raw_pointer_cast(&I[0]); + IndexType *J_ptr= thrust::raw_pointer_cast(&J[0]); + + + + + // Kernel to fill array I with aggregates number for fine points i + iToIKernel<<>>(row_offsets_ptr, aggregates_ptr, I_ptr, (int)n_vertex); + cudaCheckError(); + + // Kernel to fill array J with aggregates number for fine points j + jToJKernel<<>>(column_indices_ptr, aggregates_ptr, J_ptr, (int)n_edges); + cudaCheckError(); + + // Copy A.values to V array + thrust::copy(thrust::device, csr_val_d.begin(), csr_val_d.begin() + n_edges, V.begin()); + cudaCheckError(); + //cudaDeviceSynchronize(); + + + // Sort (I,J,V) by rows and columns (I,J) + // TODO : remove cusp depedency + sort_by_row_and_column(thrust::device, I, J, V); + cudaCheckError(); + + cudaDeviceSynchronize(); + + // compute unique number of nonzeros in the output + IndexType NNZ = 
thrust::inner_product(thrust::make_zip_iterator(thrust::make_tuple(I.begin(), J.begin())), + thrust::make_zip_iterator(thrust::make_tuple(I.end (), J.end())) - 1, + thrust::make_zip_iterator(thrust::make_tuple(I.begin(), J.begin())) + 1, + IndexType(0), + thrust::plus(), + thrust::not_equal_to< thrust::tuple >()) + 1; + cudaCheckError(); + + // allocate space for coarse matrix Ac + new_csr_ptr_d.resize(num_aggregates+1); + new_csr_ind_d.resize(NNZ); + new_csr_val_d.resize(NNZ); + + + // Reduce by key to fill in Ac.column_indices and Ac.values + thrust::device_vector new_row_indices(NNZ,0); + + + thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(I.begin(), J.begin())), + thrust::make_zip_iterator(thrust::make_tuple(I.end(), J.end())), + V.begin(), + thrust::make_zip_iterator(thrust::make_tuple(new_row_indices.begin(), new_csr_ind_d.begin())), + new_csr_val_d.begin(), + thrust::equal_to< thrust::tuple >(), + thrust::plus()); + cudaCheckError(); + + indices_to_offsets(thrust::device, new_row_indices, new_csr_ptr_d); + cudaCheckError(); + +} + diff --git a/cpp/nvgraph/cpp/include/thrust_traits.hxx b/cpp/nvgraph/cpp/include/thrust_traits.hxx new file mode 100644 index 00000000000..922d680474d --- /dev/null +++ b/cpp/nvgraph/cpp/include/thrust_traits.hxx @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#ifndef THRUST_TRAITS_HXX + +#define THRUST_TRAITS_HXX + + + +#include + +#include + + + +namespace nvgraph + +{ + + //generic Vector Ptr Type facade: + + // + + template + + struct VectorPtrT; + + + + //partial specialization for device_vector: + + // + + template + + struct VectorPtrT > + + { + + typedef thrust::device_ptr PtrT; + + }; + + + + //partial specialization for host_vector: + + // + + template + + struct VectorPtrT > + + { + + typedef typename thrust::host_vector::value_type* PtrT; + + }; + +} + +#endif + diff --git a/cpp/nvgraph/cpp/include/triangles_counting.hxx b/cpp/nvgraph/cpp/include/triangles_counting.hxx new file mode 100644 index 00000000000..360996afbef --- /dev/null +++ b/cpp/nvgraph/cpp/include/triangles_counting.hxx @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace nvgraph +{ + +namespace triangles_counting +{ + + +typedef enum { TCOUNT_DEFAULT, TCOUNT_BSH, TCOUNT_B2B, TCOUNT_WRP, TCOUNT_THR } TrianglesCountAlgo; + + +template +class TrianglesCount +{ +private: + //CsrGraph & m_last_graph ; + AsyncEvent m_event; + uint64_t m_triangles_number; + spmat_t m_mat; + int m_dev_id; + cudaDeviceProp m_dev_props; + + Vector m_seq; + + cudaStream_t m_stream; + + bool m_done; + + void tcount_bsh(); + void tcount_b2b(); + void tcount_wrp(); + void tcount_thr(); + +public: + // Simple constructor + TrianglesCount(const CsrGraph & graph, cudaStream_t stream = NULL, int device_id = -1); + // Simple destructor + ~TrianglesCount(); + + NVGRAPH_ERROR count(TrianglesCountAlgo algo = TCOUNT_DEFAULT ); + inline uint64_t get_triangles_count() const {return m_triangles_number;} +}; + +} // end namespace triangles_counting + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/include/triangles_counting_defines.hxx b/cpp/nvgraph/cpp/include/triangles_counting_defines.hxx new file mode 100644 index 00000000000..28ced20a4cd --- /dev/null +++ b/cpp/nvgraph/cpp/include/triangles_counting_defines.hxx @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#ifdef _MSC_VER +#include +#else +#include +#endif + + +/* +#ifdef MSVC_VER +#include +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanForward64) +#pragma intrinsic(_BitScanReverse) +#pragma intrinsic(_BitScanReverse64) +#endif +*/ + +#define MIN(x,y) (((x)<(y))?(x):(y)) +#define MAX(x,y) (((x)>(y))?(x):(y)) + +#define THREADS (128) +#define DIV_UP(a,b) (((a)+((b)-1))/(b)) +#define BITSOF(x) (sizeof(*x)*8) + +#define BLK_BWL0 (128) +#define WRP_BWL0 (128) + +#define HUGE_GRAPH + +#define DEG_THR1 (3.5) +#define DEG_THR2 (38.0) + +namespace nvgraph +{ + +namespace triangles_counting +{ + +template struct type_utils; + +template <> +struct type_utils +{ + typedef int LOCINT; + static const LOCINT LOCINT_MAX = INT_MAX; +#ifdef MPI_VERSION + static const MPI_Datatype LOCINT_MPI = MPI_INT; +#endif + static __inline__ LOCINT abs(const LOCINT& x) + { + return abs(x); + } +}; + +template <> +struct type_utils +{ + typedef uint64_t LOCINT; + static const LOCINT LOCINT_MAX = LLONG_MAX; +#ifdef MPI_VERSION + static const MPI_Datatype LOCINT_MPI = MPI_LONG_LONG; +#endif + + static __inline__ LOCINT abs(const LOCINT& x) + { + return llabs(x); + } +}; + + +template +struct spmat_t { + T N; + T nnz; + T nrows; + const T *roff_d; + const T *rows_d; + const T *cols_d; + bool is_lower_triangular; +}; + +} // namespace triangles_counting + +} // namespace nvgraph diff --git a/cpp/nvgraph/cpp/include/triangles_counting_kernels.hxx b/cpp/nvgraph/cpp/include/triangles_counting_kernels.hxx new file mode 100644 index 00000000000..4574073ce35 --- /dev/null +++ b/cpp/nvgraph/cpp/include/triangles_counting_kernels.hxx @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace nvgraph +{ + +namespace triangles_counting +{ + +template +void tricnt_bsh(T nblock, spmat_t *m, uint64_t *ocnt_d, size_t bmld, cudaStream_t stream); +template +void tricnt_wrp(T nblock, spmat_t *m, uint64_t *ocnt_d, unsigned int *bmap_d, size_t bmld, cudaStream_t stream); +template +void tricnt_thr(T nblock, spmat_t *m, uint64_t *ocnt_d, cudaStream_t stream); +template +void tricnt_b2b(T nblock, spmat_t *m, uint64_t *ocnt_d, unsigned int *bmapL0_d, size_t bmldL0, unsigned int *bmapL1_d, size_t bmldL1, cudaStream_t stream); + +template +uint64_t reduce(uint64_t *v_d, T n, cudaStream_t stream); +template +void create_nondangling_vector(const T *roff, T *p_nonempty, T *n_nonempty, size_t n, cudaStream_t stream); + +void myCudaMemset(unsigned long long *p, unsigned long long v, long long n, cudaStream_t stream); + +} // namespace triangles_counting + +} // namespace nvgraph diff --git a/cpp/nvgraph/cpp/include/util.cuh b/cpp/nvgraph/cpp/include/util.cuh new file mode 100644 index 00000000000..24b3e281821 --- /dev/null +++ b/cpp/nvgraph/cpp/include/util.cuh @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace nvlouvain{ + +#define BLOCK_SIZE_1D 64 +#define BLOCK_SIZE_2D 16 +#define CUDA_MAX_KERNEL_THREADS 256 +#define CUDA_MAX_BLOCKS_1D 65535 +#define CUDA_MAX_BLOCKS_2D 256 +#define LOCAL_MEM_MAX 512 +#define GRID_MAX_SIZE 65535 +#define WARP_SIZE 32 + +#define CUDA_CALL( call ) \ +{ \ + cudaError_t cudaStatus = call; \ + if ( cudaSuccess != cudaStatus ) \ + fprintf(stderr, "ERROR: CUDA call \"%s\" in line %d of file %s failed with %s (%d).\n", \ + #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ +} + +#define THRUST_SAFE_CALL( call ) \ +{ \ + try{ \ + call; \ + } \ + catch(std::bad_alloc &e){ \ + fprintf(stderr, "ERROR: THRUST call \"%s\".\n" \ + #call); \ + exit(-1); \ + } \ +} + +#define COLOR_GRN "\033[0;32m" +#define COLOR_MGT "\033[0;35m" +#define COLOR_WHT "\033[0;0m" + +inline std::string time_now(){ + struct timespec ts; + timespec_get(&ts, TIME_UTC); + char buff[100]; + strftime(buff, sizeof buff, "%T", gmtime(&ts.tv_sec)); + std::string s = buff; + s +="."+std::to_string(ts.tv_nsec).substr(0, 6); + + return s; +} + +typedef enum{ + NVLOUVAIN_OK = 0, + NVLOUVAIN_ERR_BAD_PARAMETERS = 1, +}NVLOUVAIN_STATUS; + +using nvlouvainStatus_t = NVLOUVAIN_STATUS; + +const char* nvlouvainStatusGetString(nvlouvainStatus_t status){ + std::string s; + switch(status){ + case 0: + s = "NVLOUVAIN_OK"; + break; + case 1: + s = "NVLOUVAIN_ERR_BAD_PARAMETERS"; + break; + default: + break; + } + return s.c_str(); +} + +template +void display_vec(VecType vec, std::ostream& ouf=std::cout){ + auto it = vec.begin(); + ouf< +void display_intvec_size(VecType vec, unsigned size){ + printf("%d", (int)vec[0]); + for(unsigned i = 1; i < size; ++i) { + printf(", %d",(int)vec[i]); + } + printf("\n"); +} + + +template +void display_vec_size(VecType vec, unsigned size){ + for(unsigned i = 0; i < size; ++i) { + printf("%f ",vec[i]); + } + printf("\n"); +} + +template +__host__ __device__ void display_vec(VecIter vec, int size){ + + for(unsigned i = 0; i < size; ++i) { + printf("%f ", (*(vec+i))); + } + printf("\n"); +} + + +template +__host__ __device__ void display_vec_with_idx(VecType vec, int size, int offset=0){ + + for(unsigned i = 0; i < size; ++i) { + printf("idx:%d %f\n", i+offset, (*(vec+i))); + } + printf("\n"); +} + +template +void display_cluster(std::vector& vec, std::ostream& ouf=std::cout){ + + for(const auto& it: vec){ + for(unsigned idx = 0; idx +int folded_print_float(VecType s){ + return printf("%f\n", s); +} + +template +int folded_print_float(VecType1 s, VecType2 ... vec){ + return printf("%f ", s) + folded_print_float(vec...); +} + + +template +int folded_print_int(VecType s){ + return printf("%d\n", (int)s); +} + +template +int folded_print_int(VecType1 s, VecType2 ... 
vec){ + return printf("%d ", (int)s) + folded_print_int(vec...); +} + +}//nvlouvain diff --git a/cpp/nvgraph/cpp/include/valued_csr_graph.cuh b/cpp/nvgraph/cpp/include/valued_csr_graph.cuh new file mode 100644 index 00000000000..81e0e517f06 --- /dev/null +++ b/cpp/nvgraph/cpp/include/valued_csr_graph.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace nvlouvain{ + + +template +class Vector: public thrust::device_vector{ + public: + Vector(): thrust::device_vector(){} + Vector(int size): thrust::device_vector(size){} + + template + Vector(Iter begin, Iter end): thrust::device_vector(begin, end){} + + inline void fill(const ValType val){ + thrust::fill(thrust::cuda::par, this->begin(), this->end(), val); + } + inline thrust::device_vector& to_device_vector(){ + return static_cast> (*this); + } + + inline ValType* raw(){ + return (ValType*)thrust::raw_pointer_cast( thrust::device_vector::data() ); + } + + inline int get_size(){ + return this->size(); + } +}; + + +template +class CsrGraph{ + + public: + CsrGraph( thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, thrust::device_vector& csr_val_d, IndexType v, IndexType e, bool _w=false): + _n_vertices(v), _n_edges(e), csr_ptr(csr_ptr_d.begin(), csr_ptr_d.end()), csr_ind(csr_ind_d.begin(), csr_ind_d.end()), csr_val(csr_val_d.begin(), csr_val_d.end()), weighted(_w){ + } + + CsrGraph( thrust::host_vector& csr_ptr_d, thrust::host_vector& csr_ind_d, thrust::host_vector& csr_val_d, IndexType v, IndexType e, bool _w=false): + _n_vertices(v), _n_edges(e), csr_ptr(csr_ptr_d.begin(), csr_ptr_d.end()), csr_ind(csr_ind_d.begin(), csr_ind_d.end()), csr_val(csr_val_d.begin(), csr_val_d.end()), weighted(_w){ + } + + + inline const IndexType get_num_vertices() const{ + return _n_vertices; + } + + inline const IndexType get_num_edges() const{ + return csr_ptr.back(); + } + inline const IndexType* get_raw_row_offsets() const{ + return thrust::raw_pointer_cast(csr_ptr.data()); + } + inline const IndexType* get_raw_column_indices()const { + return thrust::raw_pointer_cast(csr_ind.data());; + } + inline const ValueType* get_raw_values() const{ + return thrust::raw_pointer_cast(csr_val.data()); + } + inline const Vector & get_row_offsets() const{ + return csr_ptr; + } + inline const Vector & get_column_indices() const{ + return csr_ind; + } + inline const Vector & get_values() const{ + return csr_val; + } + inline const Vector & get_csr_ptr() const{ + return csr_ptr; + } + inline const Vector & get_csr_ind() const{ + return csr_ind; + } + inline const Vector & get_csr_val() const{ + return csr_val; + } + + inline void update_csr_ptr(thrust::device_vector & d_v){ + thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_ptr.begin()); + } + inline void update_csr_ptr_n(thrust::device_vector & d_v,unsigned size){ + csr_ptr.resize(size); + thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_ptr.begin()); + } + + + inline void 
update_csr_ind(thrust::device_vector & d_v){ + thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_ind.begin()); + } + inline void update_csr_ind_n(thrust::device_vector & d_v,unsigned size){ + csr_ind.resize(size); + thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_ind.begin()); + } + + + inline void update_csr_val(thrust::device_vector & d_v){ + thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_val.begin()); + } + inline void update_csr_val_n(thrust::device_vector & d_v,unsigned size){ + csr_val.resize(size); + thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_val.begin()); + } + inline void update_graph(size_t n_v, size_t n_e, thrust::device_vector & ptr, thrust::device_vector & ind, thrust::device_vector & val, bool w){ + _n_vertices = n_v; + _n_edges = n_e; +#ifdef DEBUG + if(n_v != ptr.size()){ + std::cout<<"n_vertex size not match\n"; + } + if(n_e != ind.size() || n_e != val.size()){ + std::cout<<"n_edges size not match\n"; + } +#endif + update_csr_ptr_n(ptr, _n_vertices); + update_csr_ind_n(ind, _n_edges); + update_csr_val_n(val, _n_edges); + weighted = w; + } + private: + size_t _n_vertices; + size_t _n_edges; + Vector csr_ptr; + Vector csr_ind; + Vector csr_val; + bool weighted; +}; + + + + +}; //nvlouvain diff --git a/cpp/nvgraph/cpp/include/valued_csr_graph.hxx b/cpp/nvgraph/cpp/include/valued_csr_graph.hxx new file mode 100644 index 00000000000..5fe1986c449 --- /dev/null +++ b/cpp/nvgraph/cpp/include/valued_csr_graph.hxx @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "csr_graph.hxx" +#include "nvgraph_vector.hxx" + +namespace nvgraph +{ + +/*! A ValuedCsrGraph is a graph strored in a CSR data structure. + It represents an weighted graph and has storage for row_offsets and column_indices and values + */ +template +class ValuedCsrGraph : public nvgraph::CsrGraph +{ +public: + typedef IndexType_ IndexType; + typedef ValueType_ ValueType; + +private: + typedef nvgraph::CsrGraph Parent; + +protected: + /*! Storage for the nonzero entries of the CSR data structure. + */ + SHARED_PREFIX::shared_ptr values; + +public: + + /*! Construct an empty \p ValuedCsrGraph. + */ + ValuedCsrGraph(void) {} + /*! Destruct a \p ValuedCsrGraph. + */ + ~ValuedCsrGraph(void) {} + + /*! Construct a \p ValuedCsrGraph with a specific shape and number of nonzero entries. + * + * \param num_rows Number of rows. + * \param num_entries Number of nonzero graph entries. + */ + ValuedCsrGraph(size_t num_rows, size_t num_entries, cudaStream_t stream) + : Parent(num_rows, num_entries, stream), + values(allocateDevice(num_entries, NULL)) {} + + /*! Construct a \p ValuedCsrGraph from another graph. + * + * \param ValuedCsrGraph Another graph in csr + */ + ValuedCsrGraph(const ValuedCsrGraph& gr): + Parent(gr), + values(gr.values) + {} + + /*! Construct a \p ValuedCsrGraph from another graph. 
+ * + * \param ValuedCsrGraph Another graph in csr + */ + ValuedCsrGraph(const Parent& gr, Vector& vals): + Parent(gr), + values(vals.raw()) + { + + } + + inline ValueType* get_raw_values() const { return values.get(); } + + + /*! Swap the contents of two \p ValuedCsrGraph objects. + * + * \param graph Another graph in csr + */ + void swap(ValuedCsrGraph& graph); + + /*! Assignment from another graph. + * + * \param graph Another graph in csr + */ + ValuedCsrGraph& operator=(const ValuedCsrGraph& graph); + + //Accept method injection + DEFINE_VISITABLE(IndexType_) + +}; // class ValuedCsrGraph +} + diff --git a/cpp/nvgraph/cpp/include/widest_path.hxx b/cpp/nvgraph/cpp/include/widest_path.hxx new file mode 100644 index 00000000000..317da2cd8c1 --- /dev/null +++ b/cpp/nvgraph/cpp/include/widest_path.hxx @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +namespace nvgraph +{ +template +class WidestPath +{ +public: + typedef IndexType_ IndexType; + typedef ValueType_ ValueType; +private: + ValuedCsrGraph m_network ; + Vector m_widest_path; + Vector m_tmp; + Vector m_mask; // mask[i] = 0 if we can ignore the i th column in the csrmv + IndexType m_source; + ValueType m_residual; + int m_iterations; + bool m_is_setup; + cudaStream_t m_stream; + bool solve_it(); + void setup(IndexType source_index, Vector& source_connection, Vector& WidestPath_result); +public: + // Simple constructor + WidestPath(void) {}; + // Simple destructor + ~WidestPath(void) {}; + // Create a WidestPath solver attached to a the transposed of a weighted network + // *** network is the transposed/CSC*** + WidestPath(const ValuedCsrGraph & network, cudaStream_t stream = 0):m_network(network),m_is_setup(false), m_stream(stream) {}; + + /*! Find the Widest Path from the vertex source_index to every other vertices. + * + * \param source_index The source. + * \param source_connection The connectivity of the source + * - if there is a link from source_index to i, source_connection[i] = E(source_index, i) ) + * - otherwise source_connection[i] = op.plus->id + * - source_connection[source_index] = op.time->id + The source_connection is provided as input + * \param (output) m_widest_path m_widest_path[i] contains the Widest Path from the source to the vertex i. + */ + + NVGRAPH_ERROR solve(IndexType source_index, Vector& source_connection, Vector& WidestPath_result); + inline int get_iterations() const {return m_iterations;} +}; +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/arnoldi.cu b/cpp/nvgraph/cpp/src/arnoldi.cu new file mode 100644 index 00000000000..8975b985f83 --- /dev/null +++ b/cpp/nvgraph/cpp/src/arnoldi.cu @@ -0,0 +1,1245 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "valued_csr_graph.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_vector_kernels.hxx" +#include "nvgraph_cusparse.hxx" +#include "nvgraph_cublas.hxx" +#include "nvgraph_lapack.hxx" +#include "nvgraph_error.hxx" +#include "pagerank_kernels.hxx" +#include "arnoldi.hxx" +#include "nvgraph_csrmv.hxx" +#include "matrix.hxx" + + +#include "debug_macros.h" +#ifdef DEBUG +#define IRAM_VERBOSE +// #define IRAM_DEBUG +#endif + +namespace nvgraph +{ + +template +ImplicitArnoldi::ImplicitArnoldi(const ValuedCsrGraph & A) + :m_A(A), m_markov(false), m_laplacian(false), m_tolerance(1.0E-12), m_iterations(0), m_dirty_bit(false), m_max_iter(500), has_init_guess(false) +{ +// initialize cuda libs outside of the solve (this is slow) +// cusparseHandle_t t1 = Cusparse::get_handle(); +// cublasHandle_t t2 = Cublas::get_handle(); + +// compiler is complainig, unused variables + Cusparse::get_handle(); + Cublas::get_handle(); +} + +template +ImplicitArnoldi::ImplicitArnoldi(const ValuedCsrGraph & A, int parts) + :m_A(A), m_parts(parts), m_laplacian(true), m_markov(false), m_tolerance(1.0E-9), m_iterations(0), m_dirty_bit(false), m_max_iter(500), has_init_guess(false) +{ +// initialize cuda libs outside of the solve (this is slow) +// cusparseHandle_t t1 = Cusparse::get_handle(); +// cublasHandle_t t2 = Cublas::get_handle(); + +// compiler is complainig, unused variables + Cusparse::get_handle(); + Cublas::get_handle(); +} + +template +ImplicitArnoldi::ImplicitArnoldi(const ValuedCsrGraph & A, Vector& dangling_nodes, const float tolerance, const int max_iter, ValueType alpha) + :m_A(A), m_a(dangling_nodes), m_damping(alpha), m_markov(true), m_laplacian(false), m_tolerance(tolerance), m_iterations(0), m_dirty_bit(false), m_max_iter(max_iter), has_init_guess(false) +{ +// initialize cuda libs outside of the solve (this is slow) +// cusparseHandle_t t1 = Cusparse::get_handle(); +// cublasHandle_t t2 = Cublas::get_handle(); + +// compiler is complainig, unused variables + Cusparse::get_handle(); + Cublas::get_handle(); +} + +template +NVGRAPH_ERROR ImplicitArnoldi::solve(const int restart_it, const int nEigVals, + Vector& initial_guess, + Vector& eigVals, + Vector& eigVecs, + const int nested_subspaces_freq) +{ + //try { + #ifdef IRAM_VERBOSE + std::stringstream ss; + ss.str(std::string()); + size_t used_mem, free_mem, total_mem; + ss <<" ------------------ImplicitArnoldi------------------"<< std::endl; + ss <<" --------------------------------------------"<< std::endl; + ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; + ss <<" --------------------------------------------"<< std::endl; + COUT()< +void ImplicitArnoldi::setup(Vector& initial_guess, const int restart_it, const int nEigVals) +{ + m_krylov_size = restart_it; + m_select = m_krylov_size; + m_nr_eigenvalues = nEigVals; + + // We always compute an extra eigenvalue to make sure we always have m_nr_eigenvalues + // So even if the double shifted QR consume the m_n_eigenvalues^th eigenvalue we are fine + 
m_n_eigenvalues = m_nr_eigenvalues+1; + + // General parameter check + if(m_krylov_size >= static_cast(m_A.get_num_vertices())) + FatalError("ARNOLDI: The krylov subspace size is larger than the matrix", NVGRAPH_ERR_BAD_PARAMETERS); + if(m_n_eigenvalues >= m_krylov_size) + FatalError("ARNOLDI: The number of required eigenvalues +1 is larger than the maximum krylov subspace size", NVGRAPH_ERR_BAD_PARAMETERS); + if(m_krylov_size < 3) + FatalError("ARNOLDI: Sould perform at least 3 iterations before restart", NVGRAPH_ERR_BAD_PARAMETERS); + + // Some checks on optional Markov parameters + if (m_markov) + { + if (m_nr_eigenvalues != 1) + FatalError("ARNOLDI: Only one eigenpair is needed for the equilibrium of a Markov chain", NVGRAPH_ERR_BAD_PARAMETERS); + if (m_damping > 0.99999 || m_damping < 0.0001) + FatalError("ARNOLDI: Wrong damping factor value", NVGRAPH_ERR_BAD_PARAMETERS); + } + + //if (m_laplacian) + //{ + // if (m_parts > m_n_eigenvalues) + // FatalError("IRAM: ", NVGRAPH_ERR_BAD_PARAMETERS); + //} + + // Some checks on optional miramns parameters + if ( m_nested_subspaces_freq <= 0) + { + m_nested_subspaces = 0; + m_miramns=false; + } + else + { + m_safety_lower_bound = 7; + if( m_nested_subspaces_freq > (m_krylov_size-(m_safety_lower_bound+m_nr_eigenvalues+1))) // ie not enough space betwen the number of ev and the max size of the subspace + { + #ifdef DEBUG + COUT()<<"MIRAMns Warning: Invalid frequence of nested subspaces, nested_subspaces_freq > m_max-4*n_eigVal" << std::endl; + #endif + m_miramns=false; + } + else + { + m_miramns=true; + // This formula should give the number of subspaces + // We allways count the smallest, the largest plus every size matching m_nested_subspaces_freq between them. + m_nested_subspaces = 2 + (m_krylov_size-(m_safety_lower_bound+m_nr_eigenvalues+1)-1)/m_nested_subspaces_freq; + + //COUT()<<"Number of nested subspaces : "<(m_Vi.size()); ++i) + { + m_Vi[i]=m_V.raw()+i*n; + } + if (!has_init_guess) + { + const ValueType_ one = 1; + const ValueType_ zero = 0; + curandGenerator_t randGen; + // Initialize random number generator + CHECK_CURAND(curandCreateGenerator(&randGen,CURAND_RNG_PSEUDO_PHILOX4_32_10)); + CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 123456/*time(NULL)*/)); + // Initialize initial vector + CHECK_CURAND(curandGenerateNormalX(randGen, m_V.raw(), n, zero, one)); + ValueType_ normQ1 = Cublas::nrm2(n, m_V.raw(), 1); + Cublas::scal(n, (ValueType_)1.0/normQ1, m_V.raw(), 1); + } + else + { + m_V.copy(initial_guess); + } + //dump_raw_vec (m_V.raw(), 10, 0); + if(m_markov) + { + update_dangling_nodes(n, m_a.raw(), static_cast( m_damping)); + //dump(m_a.raw(), 100, 0); + m_b.allocate(n); + ValueType_ val = static_cast(1.0/n); // + m_b.fill(val); + //m_b.dump(0,n); + } + + if (m_laplacian) + { + // degree matrix + m_D.allocate(n); + m_b.allocate(n); + ValueType_ val = 1.0; + m_b.fill(val); + size_t n = m_A.get_num_vertices(); + size_t nnz = m_A.get_num_edges(); + ValueType_ alpha = 1.0, beta =0.0, gamma= -1.0; + +#if __cplusplus > 199711L + Semiring sring = Semiring::PlusTimes; +#else + Semiring sring = PlusTimes; +#endif + csrmv_mp(n, n, nnz, alpha, m_A, m_b.raw(), beta, m_D.raw(), sring); + //Cusparse::csrmv(false, false, + // n, n, nnz, + // &alpha, + // m_A.get_raw_values(), + // m_A.get_raw_row_offsets(), + // m_A.get_raw_column_indices(), + // m_b.raw(), + // &beta, + // m_D.raw()); + Cublas::scal(nnz, gamma, m_A.get_raw_values(), 1); + + // m_b can be deleted now + //dump_raw_vec ( m_A.get_raw_values(), nnz, 0); + 
//dump_raw_vec (m_D.raw(), n, 0); + } + + + // normalize + Cublas::scal(n, (ValueType_)1.0/Cublas::nrm2(n, m_Vi[0], 1) , m_Vi[0], 1); + m_iterations = 0; + // arnoldi from 0 to k + solve_arnoldi(0,m_krylov_size); + +} +#ifdef DEBUG +template +void dump_host_dense_mat(std::vector& v, int ld) +{ + std::stringstream ss; + ss.str(std::string()); + ss << std::setw(10); + ss.precision(3); + for (int i = 0; i < ld; ++i) + { + for (int j = 0; j < ld; ++j) + { + ss << v[i*ld+j] << std::setw(10); + } + ss << std::endl; + } + COUT()< +void dump_host_vec(std::vector& v) +{ + std::stringstream ss; + ss.str(std::string()); + ss << std::setw(10); + ss.precision(4); + for (int i = 0; i < v.size(); ++i) + ss << v[i] << std::setw(10); + ss << std::endl; + COUT()< +bool ImplicitArnoldi::solve_arnoldi(int lower_bound, int upper_bound) +{ + int inc =1, mns_residuals_idx = 0; + size_t n = m_A.get_num_vertices(); + size_t nnz = m_A.get_num_edges(); + + ValueType_ alpha = 1.0, beta =0.0, Hji = 0, dot_res; + +#if __cplusplus > 199711L + Semiring sring = Semiring::PlusTimes; +#else + Semiring sring = PlusTimes; +#endif + + //m_V.dump(lower_bound*n,n); + + if (m_miramns) + { + std::fill (m_mns_residuals.begin(),m_mns_residuals.end(),0.0); + } + + for (int i = lower_bound; i < upper_bound; ++i) + { + // beta = norm(f); v = f/beta; + if (i>0 && i == lower_bound) + { + m_beta = Cublas::nrm2(n, m_Vi[i], 1); + // Vi = Vi/||Vi|| + Cublas::scal(n, (ValueType_)1.0/m_beta, m_Vi[i], inc); + // m_V.dump((i-1)*n,n); + } + + // Compute H, V and f + csrmv_mp(n, n, nnz, alpha, m_A, m_Vi[i], beta, m_Vi[i+1], sring); + //if (i == 0) dump_raw_vec (m_Vi[i+1], n, 0); + if (m_laplacian) + { + //apply to the external diagonal + dmv(n, alpha, m_D.raw(), m_Vi[i], alpha, m_Vi[i+1]); + //dump_raw_vec ( m_D.raw(), 10, 0); + //dump_raw_vec (m_Vi[i+1], 10, 0); + } + + if(m_markov) + { + Cublas::scal(n, m_damping, m_Vi[i+1], inc); + Cublas::dot(n, m_a.raw(), inc, m_Vi[i], inc, &dot_res); + Cublas::axpy(n, dot_res, m_b.raw(), inc, m_Vi[i+1], inc); + } + + // Modified GS algorithm + for (int j = 0; j <= i; ++j) + { + // H(j,i) = AVi.Vj + Cublas::dot(n, m_Vi[i+1], inc, m_Vi[j], inc, &Hji); + m_H[i*m_krylov_size + j] = Hji; + //V(i + 1) -= H(j, i) * V(j) + Cublas::axpy(n, -Hji, m_Vi[j],inc, m_Vi[i+1],inc); + } + if (i > 0) + { + // H(i+1,i) = ||Vi|| <=> H(i,i-1) = ||Vi|| + m_H[(i-1)*m_krylov_size + i] = m_beta; + } + //||Vi+1|| + m_beta = Cublas::nrm2(n, m_Vi[i+1], 1); + if (i+1 < upper_bound) + { + + Cublas::scal(n, (ValueType_)1.0/m_beta, m_Vi[i+1], inc); + } + + if (m_miramns) + { + // The smallest subspaces is always m_safety_lower_bound+m_nr_eigenvalues+1 + // The largest is allways max_krylov_size, + // Between that we check the quality at every stride (m_nested_subspaces_freq). 
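The nested-subspace (MIRAMns) schedule described in the comments above is easier to see on concrete numbers. Below is a small standalone sketch, not part of the PR; the function name and the sample parameters are made up, and it simply mirrors the `if` condition used in `solve_arnoldi` to decide at which Krylov subspace sizes the residual is evaluated.

```cpp
#include <cstdio>
#include <vector>

// Enumerate the subspace sizes at which MIRAMns evaluates the residual:
// the smallest checked size is safety_lower_bound + n_eigenvalues + 1,
// the largest is max_krylov_size, and sizes in between are checked every
// `stride` columns (m_nested_subspaces_freq in the PR).
std::vector<int> checked_subspace_sizes(int safety_lower_bound, int n_eigenvalues,
                                        int stride, int max_krylov_size)
{
    std::vector<int> sizes;
    const int first = safety_lower_bound + n_eigenvalues; // column index of the first check
    for (int i = 0; i < max_krylov_size; ++i)
    {
        if (i == first ||
            i + 1 == max_krylov_size ||
            (i > first && (i - first) % stride == 0))
            sizes.push_back(i + 1); // subspace size after processing column i
    }
    return sizes;
}

int main()
{
    // e.g. safety bound 7, 5 requested eigenvalues, stride 4, max subspace 28:
    // prints "13 17 21 25 28", i.e. 5 subspaces, which matches the
    // 2 + (28 - (7 + 5 + 1) - 1) / 4 count computed in setup().
    for (int s : checked_subspace_sizes(7, 5, 4, 28))
        std::printf("%d ", s);
    std::printf("\n");
    return 0;
}
```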
+ if( i == m_safety_lower_bound+m_nr_eigenvalues || + i+1 == upper_bound || + (i > m_safety_lower_bound+m_nr_eigenvalues && ((i-(m_safety_lower_bound+m_nr_eigenvalues))%m_nested_subspaces_freq == 0)) ) + { + //COUT()<<"i "< +bool ImplicitArnoldi::solve_it() +{ + + if (m_residual +void ImplicitArnoldi::select_subspace() +{ + #ifdef IRAM_DEBUG + COUT() < 199711L + typename std::vector::iterator it = std::min_element(std::begin(m_mns_residuals), std::end(m_mns_residuals)); +#else + typename std::vector::iterator it = std::min_element(m_mns_residuals.begin(), m_mns_residuals.end()); +#endif + + m_residual = *it; +#if __cplusplus > 199711L + int dist = static_cast(std::distance(std::begin(m_mns_residuals), it)); +#else + int dist = static_cast(std::distance(m_mns_residuals.begin(), it)); +#endif + m_select = std::min((m_safety_lower_bound+m_nr_eigenvalues) + (m_nested_subspaces_freq*dist) +1, m_krylov_size); + m_select_idx = dist ; + //COUT()<<"m_select "< +void ImplicitArnoldi::extract_subspace(int m) +{ + + if (m != m_select || m_H_select.size() == 0) + { + m_H_select.resize(m_select*m_select); + m_H_tmp.resize(m_select*m_select); + m_Q.resize(m_select*m_select); + m_Q_tmp.resize(m_select*m_select); + } + //m_ritz_eigenvalues.resize(m_select);; //host + //m_ritz_eigenvectors.resize(m_select*m_select); + // copy + //int k = m_krylov_size-m_select; + //int l = 0; + //for(int i = k; i +void ImplicitArnoldi::compute_residual(int subspace_size, bool dirty_bit) +{ + //dump_host_dense_mat(m_H_select, m_select); + if (m_miramns) + { + + if (dirty_bit) + { + if (static_cast(m_H_tmp.size()) != subspace_size*subspace_size) + m_H_tmp.resize(subspace_size*subspace_size); + //std::fill (m_ritz_eigenvalues.begin(),m_ritz_eigenvalues.end(),0.0); + //std::fill (m_ritz_eigenvectors.begin(),m_ritz_eigenvectors.end(),0.0); + + for(int i = 0; i::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0], &m_ritz_eigenvectors[0], subspace_size , subspace_size, subspace_size); + Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0], &m_ritz_eigenvalues_i[0], &m_ritz_eigenvectors[0], NULL, subspace_size , subspace_size, subspace_size); + } + } + else + { + if (dirty_bit) + { + // we change m_H_tmp size during miramns + if (m_H_tmp.size() != m_H.size()) + m_H_tmp.resize(m_H.size()); + std::copy(m_H.begin(), m_H.end(), m_H_tmp.begin()); + //Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0], &m_ritz_eigenvectors[0], m_krylov_size , m_krylov_size, m_krylov_size); + Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0], &m_ritz_eigenvalues_i[0], &m_ritz_eigenvectors[0], NULL, m_krylov_size , m_krylov_size, m_krylov_size); + } + } + + //COUT() << "m_ritz_eigenvalues : "< +void ImplicitArnoldi::implicit_restart() +{ + // optim: avoid the cpy here + if (!m_miramns) std::copy(m_H.begin(), m_H.end(), m_H_select.begin()); + select_shifts(m_dirty_bit); + #ifdef IRAM_DEBUG + for(int i = 0; i +void ImplicitArnoldi::select_shifts(bool dirty_bit) +{ + // dirty_bit is false by default + if (dirty_bit) + { + std::copy(m_H_select.begin(), m_H_select.end(), m_H_tmp.begin()); + //Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0], &m_ritz_eigenvectors[0], m_select , m_select, m_select); + Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0],&m_ritz_eigenvalues_i[0], &m_ritz_eigenvectors[0], NULL, m_select , m_select, m_select); + // #ifdef IRAM_DEBUG + // COUT() << "m_ritz_eigenvalues : "< + bool cmp_LR(const std::pair &left, const std::pair &right){ + return left.second > right.second; + }; +#endif + + +template +void ImplicitArnoldi::LR(int subspace_sz) +{ + // Eigen 
values of interest have the largest real part + std::vector > items; + for (int i = 0; i < subspace_sz; ++i) + items.push_back(std::make_pair( i, m_ritz_eigenvalues[i])); + + // this is a reverse key value sort by algebraic value + // in this case we select the largest eigenvalues + // In the future we can add other shift selection strategies here + // to converge to different eigen values (reverse sort by magnitude, or usual sort by magnitude etc ). +#if __cplusplus > 199711L + std::sort(items.begin(), items.end(),[](const std::pair &left, const std::pair &right) + {return left.second > right.second; }); +#else + std::sort(items.begin(), items.end(), cmp_LR); +#endif + + // Now we need to reorder the vectors accordingly + std::vector ritz_tmp(m_ritz_eigenvectors); + + for (int i = 0; i < subspace_sz; ++i) + { + //COUT() << "reordrering : " << items[i].first < tmp_i(m_ritz_eigenvalues_i); + for (int i = 0; i < subspace_sz; ++i) + { + m_ritz_eigenvalues_i[i] = tmp_i[items[i].first]; + } +} + + +template +bool cmp_LM(const std::pair &left, const std::pair &right){ + return left.second > right.second; +}; + +template +void ImplicitArnoldi::LM(int subspace_sz) +{ + std::vector magnitude(subspace_sz); + std::vector > kv; + + for (int i = 0; i < subspace_sz; ++i) + magnitude[i] = m_ritz_eigenvalues[i]*m_ritz_eigenvalues[i] + m_ritz_eigenvalues_i[i]*m_ritz_eigenvalues_i[i]; + + for (int i = 0; i < subspace_sz; ++i) + kv.push_back(std::make_pair( i, magnitude[i])); + + // this is a reverse key value sort by magnitude + // in this case we select the largest magnitude + + std::sort(kv.begin(), kv.end(), cmp_LM); + + // Now we need to reorder the vectors accordingly + std::vector ritz_tmp(m_ritz_eigenvectors); + std::vector ev(m_ritz_eigenvalues); + std::vector ev_i(m_ritz_eigenvalues_i); + for (int i = 0; i < subspace_sz; ++i) + { + //COUT() << "reordrering : " << kv[i].first < + bool cmp_SR(const std::pair &left, const std::pair &right){ + return left.second < right.second; + }; +#endif + +template +void ImplicitArnoldi::SR(int subspace_sz) +{ + // Eigen values of interest have the largest real part + std::vector > items; + for (int i = 0; i < subspace_sz; ++i) + items.push_back(std::make_pair( i, m_ritz_eigenvalues[i])); + + // this is a reverse key value sort by algebraic value + // in this case we select the largest eigenvalues + // In the future we can add other shift selection strategies here + // to converge to different eigen values (reverse sort by magnitude, or usual sort by magnitude etc ). +#if __cplusplus > 199711L + std::sort(items.begin(), items.end(),[](const std::pair &left, const std::pair &right) + {return left.second < right.second; }); +#else + std::sort(items.begin(), items.end(), cmp_SR); +#endif + + // Now we need to reorder the vectors accordingly + std::vector ritz_tmp(m_ritz_eigenvectors); + + for (int i = 0; i < subspace_sz; ++i) + { + //COUT() << "reordrering : " << items[i].first < +void ImplicitArnoldi::qr_step() +{ + ValueType_ mu, mu_i, mu_i_sq; + int n = m_select; + int ld = m_select; + std::vector tau(n); + std::vector work(n); + int lwork = -1; + // workspace query + std::copy (m_H_select.begin(),m_H_select.end(), m_H_tmp.begin()); + Lapack::geqrf(n, n, &m_H_tmp[0], ld, &tau[0], &work[0], &lwork); + // work is a real array used as workspace. On exit, if LWORK = -1, work[0] contains the optimal LWORK. + // it can be safely casted to int here to remove the conversion warning. 
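The workspace query above follows the standard LAPACK convention: call the routine once with `lwork = -1` so that it only reports the optimal workspace size in `work[0]`, resize the workspace, then call it again to do the real work. A minimal standalone illustration against the reference Fortran interface is shown below (illustrative only; the PR goes through the `nvgraph::Lapack` wrapper, whose exact signatures are not visible in this diff, and the example assumes linking against a LAPACK implementation such as `-llapack`).

```cpp
#include <vector>

// Reference LAPACK QR factorization routine (column-major storage).
extern "C" void dgeqrf_(const int* m, const int* n, double* a, const int* lda,
                        double* tau, double* work, const int* lwork, int* info);

void qr_factorize(std::vector<double>& a, int n) // a holds an n-by-n matrix, column-major
{
    std::vector<double> tau(n), work(1);
    int lwork = -1, info = 0;

    // Phase 1: workspace query. Nothing is factorized; work[0] receives the
    // optimal LWORK as a floating-point value.
    dgeqrf_(&n, &n, a.data(), &n, tau.data(), work.data(), &lwork, &info);

    lwork = static_cast<int>(work[0]); // work[0] holds an integer value, so the cast is safe
    work.resize(lwork);

    // Phase 2: the actual factorization. R ends up in the upper triangle of a;
    // the Householder reflectors are stored below the diagonal together with tau.
    dgeqrf_(&n, &n, a.data(), &n, tau.data(), work.data(), &lwork, &info);
}
```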
+ lwork = static_cast(work[0]); + work.resize(lwork); + // Q0 = I + m_Q.assign(m_Q.size(),0.0); + shift(m_Q, m_select, m_select, -1); + //for (int j = 0; j < m_select; j++) + // m_Q[j*m_select+j] = 1.0; + + #ifdef IRAM_DEBUG + COUT() << "m_ritz_eigenvalues : "<= m_n_eigenvalues) + { + //Get the shift + mu_i = m_ritz_eigenvalues_i[i]; + mu = m_ritz_eigenvalues[i]; + shift(m_H_tmp, m_select, m_select, mu); + + if (mu_i ) + { + //Complex case + //Double shift + //(H - re_mu*I)^2 + im_mu^2*I) + + if (i==m_n_eigenvalues) + { + // if we are in this case we will consume the next eigen value which is a wanted eigenalue + // fortunately m_n_eigenvalues = m_nr_eigenvalues +1 (we alway compute one more eigenvalue) + m_n_eigenvalues -=1; + + //COUT() << "IRAM: last ev absorded in double shift" < A(m_select*m_select); + + for (int ii = 0; ii < m_select; ii++) + for (int k = 0; k < m_select; k++) + for (int j = 0; j < m_select; j++) + A[ii*m_select+j] += m_H_tmp[ii*m_select+k]* m_H_tmp[k*m_select+j]; + mu_i_sq = mu_i*mu_i; + std::copy (A.begin(),A.end(), m_H_tmp.begin()); + shift(m_H_tmp, m_select, m_select, -mu_i_sq); + + //COUT() << "H"<< m_select-i<::geqrf(n, n, &m_H_tmp[0], ld, &tau[0], &work[0], &lwork); + //H+ = (Q)'* H * Q ; + Lapack::ormqr(false, true, n, n, n, &m_H_tmp[0], ld, &tau[0], &m_H_select[0], n, &work[0], &lwork); + Lapack::ormqr(true, false, n, n, n, &m_H_tmp[0], ld, &tau[0], &m_H_select[0], n, &work[0], &lwork); + + //Q+ = Q+*Q; + Lapack::ormqr(true, false, n, n, n, &m_H_tmp[0], ld, &tau[0], &m_Q[0], n, &work[0], &lwork); + + // clean up below subdiagonal (column major storage) + + cleanup_subspace(m_H_select, m_select,m_select); + //for (int j = 0; j < m_select-1; j++) + // for (int k = j+2; k < m_select; k++) + // m_H_select[j*m_select + k] = 0; + + //COUT() << "shift : " << mu <::orgqr(n, n, n, &m_H_tmp[0], ld, &tau[0], &work[0], &lwork); + // std::copy (m_H_tmp.begin(),m_H_tmp.end(), m_Q.begin()); + if (mu_i) + i-=2; //complex + else + i-=1; //real + } + +} + +template +void ImplicitArnoldi::refine_basis() +{ + ValueType_ alpha, beta; + + // update f (and send on dev at some point) + // Back to row major -> transpose Q and mind which element we pick in H (ie stored as Ht). 
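As an aside on the shift loop in `qr_step` above: the real "double shift" relies on the identity, for a complex Ritz value mu = a + i b of the real Hessenberg matrix H,

    (H - \mu I)(H - \bar{\mu} I) = H^2 - 2aH + (a^2 + b^2) I = (H - aI)^2 + b^2 I,

which is a real matrix even though mu is not real. The code builds the right-hand side directly: `shift(m_H_tmp, ..., mu)` forms H - aI (the `shift` helper subtracts its argument from the diagonal), the triple loop accumulates its square into `A`, and `shift(m_H_tmp, ..., -mu_i_sq)` adds b^2 back onto the diagonal. Because the product pairs mu with its conjugate, one complex shift consumes two Ritz values, which is why that branch decrements `i` by 2 and why `setup()` computes one extra eigenvalue.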
+ // copy Q to dev + // Need Mat1*Mat2, where Mat1(n,m) is tall, skin, dense and Mat2(m,l) is small dense with l tmpT = H(n_ev, n_ev+1) V*Q in col maj + + alpha = 1.0; + beta = 0.0; + + // debug cleaning + //m_Q_d.fill(0); + //cudaMemcpyAsync(m_Q_d.raw(), &m_Q[0], (size_t)(nev*m_select*sizeof(m_Q[0])), cudaMemcpyHostToDevice); + //fill_raw_vec (m_V_tmp.raw(), n*(nev+1), beta); + //fill_raw_vec (m_V.raw()+n*nk, n, beta); + + //COUT() << "QT : "< +void ImplicitArnoldi::compute_eigenvectors() +{ + //dump_host_vec(m_ritz_eigenvalues); + //dump_host_dense_mat(m_ritz_eigenvectors,m_select); + int n = m_A.get_num_vertices(), + nev = m_nr_eigenvalues, + nk = m_select; + ValueType_ alpha=1.0, beta = 0.0; + cudaMemcpyAsync(m_ritz_eigenvectors_d.raw(), &m_ritz_eigenvectors[0], (size_t)(m_select*m_select*sizeof(m_ritz_eigenvectors[0])), cudaMemcpyHostToDevice); + cudaCheckError(); + Cublas::gemm(false, false, n, nev, nk, &alpha, m_V.raw(), n, + m_ritz_eigenvectors_d.raw(), nk, + &beta, m_eigenvectors.raw(), n); + //nrm 1 for pagerank + if(m_markov) + Cublas::scal(n, (ValueType_)1.0/m_eigenvectors.nrm1(), m_eigenvectors.raw(), 1); + + #ifdef IRAM_DEBUG + COUT()< +void ImplicitArnoldi::cleanup_subspace(std::vector& v, int ld, int new_sz) +{ + + // just a simple clean + + // In Out + // * * 0 0 0 * * 0 0 0 + // * * * 0 0 * * * 0 0 + // * * * * 0 * * * * 0 + // * * * * * * * * * 0 <--- new_sz + // * * * * * 0 0 0 0 0 + + for (int i = 0; i < new_sz-1; i++) + for (int j = i+2; j < new_sz; j++) + v[i*ld + j] = 0; + for (int i = new_sz; i < ld; i++) + for (int j = 0; j < ld; j++) + v[i*ld + j] = 0; + for (int i = 0; i < new_sz; i++) + for (int j = new_sz; j < ld; j++) + v[i*ld + j] = 0; + + // Not used anymore + // In Out + // * * 0 0 0 0 0 0 0 0 + // * * * 0 0 0 0 0 0 0 + // * * * * 0 * * 0 0 0 <--- new_sz + // * * * * * * * * 0 0 + // * * * * * * * * 0 0 + //int k = ld-new_sz; + //for (int i = 0; i < ld; ++i) + // for (int j = 0; j < ld; ++j) + // if ((i < k) || + // (j >= new_sz) || + // (i >= k && j-1 > i-k )) + // v[i*ld+j] = 0.0; + +} + +template +void ImplicitArnoldi::shift(std::vector& H, int ld, int m, ValueType mu) +{ + #ifdef IRAM_DEBUG + dump_host_dense_mat(H,ld); + #endif + int start = ld-m; + for (int i = start; i < ld; i++) + H[i*ld+i-start] -= mu; + #ifdef IRAM_DEBUG + dump_host_dense_mat(H,ld); + #endif +} + +template +std::vector ImplicitArnoldi::get_f_copy() +{ + std::vector tmp(m_A.get_num_vertices()); + cudaMemcpyAsync(&tmp[0],m_Vi[m_krylov_size], (size_t)(m_A.get_num_vertices()*sizeof(ValueType_)), cudaMemcpyDeviceToHost); + cudaCheckError(); + return tmp; +} + +template +std::vector ImplicitArnoldi::get_fp_copy() +{ + std::vector tmp(m_A.get_num_vertices()); + cudaMemcpyAsync(&tmp[0],m_Vi[m_n_eigenvalues], (size_t)(m_A.get_num_vertices()*sizeof(ValueType_)), cudaMemcpyDeviceToHost); + cudaCheckError(); + return tmp; +} + +template +std::vector ImplicitArnoldi::get_V_copy() +{ + std::vector tmp(m_A.get_num_vertices()*(m_krylov_size+1)); + cudaMemcpyAsync(&tmp[0],m_V.raw(), (size_t)(m_A.get_num_vertices()*(m_krylov_size+1)*sizeof(ValueType_)), cudaMemcpyDeviceToHost); + cudaCheckError(); + return tmp; +} + + +template class ImplicitArnoldi; +template class ImplicitArnoldi; +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/bfs.cu b/cpp/nvgraph/cpp/src/bfs.cu new file mode 100644 index 00000000000..218f01a87ac --- /dev/null +++ b/cpp/nvgraph/cpp/src/bfs.cu @@ -0,0 +1,560 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "bfs.hxx" +#include + +#include "nvgraph_error.hxx" +#include "bfs_kernels.cu" + +using namespace bfs_kernels; + +namespace nvgraph { + enum BFS_ALGO_STATE { + TOPDOWN, BOTTOMUP + }; + + template + NVGRAPH_ERROR Bfs::setup() { + + // Determinism flag, false by default + deterministic = false; + //Working data + //Each vertex can be in the frontier at most once + cudaMalloc(&frontier, n * sizeof(IndexType)); + cudaCheckError() + ; + + //We will update frontier during the execution + //We need the orig to reset frontier, or cudaFree + original_frontier = frontier; + + //size of bitmaps for vertices + vertices_bmap_size = (n / (8 * sizeof(int)) + 1); + //ith bit of visited_bmap is set <=> ith vertex is visited + cudaMalloc(&visited_bmap, sizeof(int) * vertices_bmap_size); + cudaCheckError() + ; + + //ith bit of isolated_bmap is set <=> degree of ith vertex = 0 + cudaMalloc(&isolated_bmap, sizeof(int) * vertices_bmap_size); + cudaCheckError() + ; + + //vertices_degree[i] = degree of vertex i + cudaMalloc(&vertex_degree, sizeof(IndexType) * n); + cudaCheckError() + ; + + //Cub working data + cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes); + + //We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive + cudaMalloc(&buffer_np1_1, (n + 1) * sizeof(IndexType)); + cudaCheckError() + ; + cudaMalloc(&buffer_np1_2, (n + 1) * sizeof(IndexType)); + cudaCheckError() + ; + + //Using buffers : top down + + //frontier_vertex_degree[i] is the degree of vertex frontier[i] + frontier_vertex_degree = buffer_np1_1; + //exclusive sum of frontier_vertex_degree + exclusive_sum_frontier_vertex_degree = buffer_np1_2; + + //Using buffers : bottom up + + //contains list of unvisited vertices + unvisited_queue = buffer_np1_1; + //size of the "last" unvisited queue : size_last_unvisited_queue + //refers to the size of unvisited_queue + //which may not be up to date (the queue may contains vertices that are now visited) + + //We may leave vertices unvisited after bottom up main kernels - storing them here + left_unvisited_queue = buffer_np1_2; + + //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket + //See top down kernels for more details + cudaMalloc( &exclusive_sum_frontier_vertex_buckets_offsets, + ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType)); + cudaCheckError() + ; + + //Init device-side counters + //Those counters must be/can be reset at each bfs iteration + //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck + cudaMalloc(&d_counters_pad, 4 * sizeof(IndexType)); + cudaCheckError() + ; + + d_new_frontier_cnt = &d_counters_pad[0]; + d_mu = &d_counters_pad[1]; + d_unvisited_cnt = &d_counters_pad[2]; + d_left_unvisited_cnt = &d_counters_pad[3]; + + //Lets use this int* for the next 3 lines + //Its dereferenced value is not initialized - so we dont care about what we put in it + IndexType * d_nisolated = d_new_frontier_cnt; + cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); + cudaCheckError() + ; + + //Computing isolated_bmap + //Only dependent on graph - not source vertex - done once + flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); + cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + cudaCheckError() + ; + + //We need nisolated to be ready to use + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs::configure( IndexType *_distances, + IndexType *_predecessors, + int *_edge_mask) + { + distances = _distances; + predecessors = _predecessors; + edge_mask = _edge_mask; + + useEdgeMask = (edge_mask != NULL); + computeDistances = (distances != NULL); + computePredecessors = (predecessors != NULL); + + //We need distances to use bottom up + if (directed && !computeDistances) + cudaMalloc(&distances, n * sizeof(IndexType)); + + cudaCheckError() + ; + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs::traverse(IndexType source_vertex) { + + //Init visited_bmap + //If the graph is undirected, we not that + //we will never discover isolated vertices (in degree = out degree = 0) + //we avoid a lot of work by flagging them now + //in g500 graphs they represent ~25% of total vertices + //more than that for wiki and twitter graphs + + if (directed) { + cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); + } else { + cudaMemcpyAsync( visited_bmap, + isolated_bmap, + vertices_bmap_size * sizeof(int), + cudaMemcpyDeviceToDevice, + stream); + } + cudaCheckError() + ; + + //If needed, setting all vertices as undiscovered (inf distance) + //We dont use computeDistances here + //if the graph is undirected, we may need distances even if + //computeDistances is false + if (distances) + fill_vec(distances, n, vec_t::max, stream); + + //If needed, setting all predecessors to non-existent (-1) + if (computePredecessors) + { + cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); + cudaCheckError() + ; + } + + // + //Initial frontier + // + + frontier = original_frontier; + + if (distances) + { + cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); + cudaCheckError() + ; + } + + //Setting source_vertex as visited + //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected + int current_visited_bmap_source_vert = 0; + + if (!directed) { + cudaMemcpyAsync(¤t_visited_bmap_source_vert, + &visited_bmap[source_vertex / INT_SIZE], + 
sizeof(int), + cudaMemcpyDeviceToHost); + cudaCheckError() + ; + //We need current_visited_bmap_source_vert + cudaStreamSynchronize(stream); + cudaCheckError() + ; + //We could detect that source is isolated here + } + + int m = (1 << (source_vertex % INT_SIZE)); + + //In that case, source is isolated, done now + if (!directed && (m & current_visited_bmap_source_vert)) { + //Init distances and predecessors are done, (cf Streamsync in previous if) + cudaCheckError() + ; + return NVGRAPH_OK; + } + + m |= current_visited_bmap_source_vert; + + cudaMemcpyAsync( &visited_bmap[source_vertex / INT_SIZE], + &m, + sizeof(int), + cudaMemcpyHostToDevice, + stream); + cudaCheckError() + ; + + //Adding source_vertex to init frontier + cudaMemcpyAsync( &frontier[0], + &source_vertex, + sizeof(IndexType), + cudaMemcpyHostToDevice, + stream); + cudaCheckError() + ; + + //mf : edges in frontier + //nf : vertices in frontier + //mu : edges undiscovered + //nu : nodes undiscovered + //lvl : current frontier's depth + IndexType mf, nf, mu, nu; + bool growing; + IndexType lvl = 1; + + //Frontier has one vertex + nf = 1; + + //all edges are undiscovered (by def isolated vertices have 0 edges) + mu = nnz; + + //all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) + //That number is wrong if source_vertex is also isolated - but it's not important + nu = n - nisolated - nf; + + //Last frontier was 0, now it is 1 + growing = true; + + IndexType size_last_left_unvisited_queue = n; //we just need value > 0 + IndexType size_last_unvisited_queue = 0; //queue empty + + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We need mf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + //At first we know we have to use top down + BFS_ALGO_STATE algo_state = TOPDOWN; + + //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data + //undirected g : need parents to be in children's neighbors + bool can_use_bottom_up = !directed && distances; + + while (nf > 0) { + //Each vertices can appear only once in the frontierer array - we know it will fit + new_frontier = frontier + nf; + IndexType old_nf = nf; + resetDevicePointers(); + + if (can_use_bottom_up) { + //Choosing algo + //Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf + + switch (algo_state) { + case TOPDOWN: + if (mf > mu / alpha) + algo_state = BOTTOMUP; + break; + case BOTTOMUP: + if (!growing && nf < n / beta) { + + //We need to prepare the switch back to top down + //We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here + count_unvisited_edges( unvisited_queue, + size_last_unvisited_queue, + visited_bmap, + vertex_degree, + d_mu, + stream); + + //Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + cudaCheckError() + ; + + //We will need mf and mu + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + algo_state = TOPDOWN; + } + break; + } + } + + //Executing algo + + switch (algo_state) { + case TOPDOWN: + compute_bucket_offsets( exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + nf, + mf, + stream); + frontier_expand( row_offsets, + col_indices, + frontier, + nf, + mf, + lvl, + new_frontier, + d_new_frontier_cnt, + exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed, + stream, + deterministic); + + mu -= mf; + + cudaMemcpyAsync( &nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError(); + + //We need nf + cudaStreamSynchronize(stream); + cudaCheckError(); + + if (nf) { + + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, new_frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We need mf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + } + break; + + case BOTTOMUP: + fill_unvisited_queue(visited_bmap, + vertices_bmap_size, + n, + unvisited_queue, + d_unvisited_cnt, + stream, + deterministic); + + size_last_unvisited_queue = nu; + + bottom_up_main(unvisited_queue, + size_last_unvisited_queue, + left_unvisited_queue, + d_left_unvisited_cnt, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + + //The number of vertices left unvisited decreases + //If it wasnt necessary last time, it wont be this time + if (size_last_left_unvisited_queue) { + cudaMemcpyAsync( &size_last_left_unvisited_queue, + d_left_unvisited_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + //We need last_left_unvisited_size + cudaStreamSynchronize(stream); + cudaCheckError() + ; + bottom_up_large( left_unvisited_queue, + size_last_left_unvisited_queue, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + } + cudaMemcpyAsync( &nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We will need nf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + break; + } + + //Updating undiscovered edges count + nu -= nf; + + //Using new frontier + frontier = new_frontier; + growing = (nf > old_nf); + + ++lvl; + } + + cudaCheckError() + ; + return NVGRAPH_OK; + } + + //Just used for benchmarks now + template + 
NVGRAPH_ERROR Bfs::traverse(IndexType *source_vertices, IndexType nsources) { + for (IndexType i = 0; i < nsources; ++i) + traverse(source_vertices[i]); + + return NVGRAPH_OK; + } + + template + void Bfs::resetDevicePointers() { + cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); + cudaCheckError() + ; + } + + template + void Bfs::clean() { + cudaCheckError() + ; + + //the vectors have a destructor that takes care of cleaning + cudaFree(original_frontier); + cudaFree(visited_bmap); + cudaFree(isolated_bmap); + cudaFree(vertex_degree); + cudaFree(d_cub_exclusive_sum_storage); + cudaFree(buffer_np1_1); + cudaFree(buffer_np1_2); + cudaFree(exclusive_sum_frontier_vertex_buckets_offsets); + cudaFree(d_counters_pad); + + //In that case, distances is a working data + if (directed && !computeDistances) + cudaFree(distances); + + cudaCheckError() + ; + } + + template class Bfs ; +} // end namespace nvgraph diff --git a/cpp/nvgraph/cpp/src/bfs2d.cu b/cpp/nvgraph/cpp/src/bfs2d.cu new file mode 100644 index 00000000000..2dc288bb0a6 --- /dev/null +++ b/cpp/nvgraph/cpp/src/bfs2d.cu @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "bfs2d.hxx" +#include "bfs2d_kernels.cuh" +#include "debug_help.h" + +namespace nvgraph { + using namespace bfs_kernels; + template + NVGRAPH_ERROR Bfs2d::setup() { + // Setup the frontier and visited bitmaps + int32_t offset = M->getMatrixDecompositionDescription().getOffset(); + int32_t bitmap_n = (offset + 31) / 32; + const MatrixDecompositionDescription* descr; + descr = &(M->getMatrixDecompositionDescription()); + frontier_bmap = new VertexData2D(descr, bitmap_n); + visited_bmap = new VertexData2D(descr, bitmap_n); + + // Setup frontier and frontierSize + frontier = new VertexData2D_Unbuffered(descr); + trim_frontier = new VertexData2D_Unbuffered(descr); + frontierSize = new VertexData2D_Unbuffered(descr, 1); + frontierSize_h.resize(descr->getNumBlocks()); + frontierDegree_h.resize(descr->getNumBlocks()); + degreeFlags = new VertexData2D_Unbuffered(descr); + + // Setup the 2d distances and predecessors + distances = new VertexData2D(descr); + predecessors = new VertexData2D(descr); + + // Setup degree exclusive sum and cub storage space + LocalType n_exSum = offset + 1; + size_t temp_bytes = getCubExclusiveSumStorageSize(n_exSum); + size_t temp_bytes_compact = getCubSelectFlaggedStorageSize(n_exSum - 1); + if (temp_bytes_compact > temp_bytes) + temp_bytes = temp_bytes_compact; + exSumStorage = new VertexData2D_Unbuffered(descr, temp_bytes); + exSumDegree = new VertexData2D_Unbuffered(descr, + offset + 1); + + // Setup bucketOffsets. Size is based on nnz, so we find the largest nnz over all blocks and use that. 
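+ // (One bucket offset is precomputed for every TOP_DOWN_BUCKET_SIZE edges expanded by the
+ // top-down kernel, see compute_bucket_offsets in bfs_kernels.cu, plus a couple of sentinel
+ // entries, so the allocation below is sized from the worst-case per-block nnz.)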
+ int32_t numBlocks = descr->getNumBlocks(); + size_t blockNnz = 0; + for (int32_t i = 0; i < numBlocks; i++) { + MultiValuedCsrGraph* block = M->getBlockMatrix(i); + blockNnz = max(block->get_num_edges(), blockNnz); + } + size_t bucketAllocSize = ((blockNnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2); + bucketOffsets = + new VertexData2D_Unbuffered(descr, bucketAllocSize); + // Size bucketOffsets based on blockNnz + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs2d::configure(GlobalType *_distances, + GlobalType *_predecessors) { + // Set the output locations. + distances_out = _distances; + predecessors_out = _predecessors; + + return NVGRAPH_OK; + } + + template + void Bfs2d::clean() { + // Delete allocated data: + if (distances) + delete distances; + if (predecessors) + delete predecessors; + if (frontier_bmap) + delete frontier_bmap; + if (visited_bmap) + delete visited_bmap; + if (frontier) + delete frontier; + if (trim_frontier) + delete trim_frontier; + if (frontierSize) + delete frontierSize; + if (exSumDegree) + delete exSumDegree; + if (exSumStorage) + delete exSumStorage; + if (bucketOffsets) + delete bucketOffsets; + if (degreeFlags) + delete degreeFlags; + } + + template + NVGRAPH_ERROR Bfs2d::traverse(GlobalType source_vertex) { + // Setup and get references for things + const MatrixDecompositionDescription& description = + M->getMatrixDecompositionDescription(); + const std::vector& deviceAssignments = description.getDeviceAssignments(); + const std::vector& blockStreams = description.getBlockStreams(); + int32_t numBlocks = description.getNumBlocks(); + LocalType offset = description.getOffset(); + int32_t current_device; + cudaGetDevice(¤t_device); + + // Initialize the frontier bitmap with the source vertex set + frontier_bmap->fillElements(0); + LocalType blockRow = source_vertex / offset; + LocalType blockOffset = source_vertex % offset; + LocalType intId = blockOffset / 32; + LocalType bitOffset = blockOffset % 32; + int32_t bmapElement = 1 << bitOffset; + int32_t bId = description.getBlockId(blockRow, blockRow); + int32_t* copyTo = frontier_bmap->getCurrent(bId) + intId; + cudaMemcpy(copyTo, &bmapElement, sizeof(int32_t), cudaMemcpyDefault); + frontier_bmap->rowScatter(); + + // Initialize frontierSizes to zero + frontierSize->fillElements(0); + frontierSize->rowScatter(); + + // Initialize the visited bitmap with the source vertex set + frontier_bmap->copyTo(visited_bmap); + visited_bmap->columnScatter(); + + // Initialize the distances and predecessors + distances->fillElements((LocalType) -1); + distances->setElement(source_vertex, (LocalType) 0); + distances->columnScatter(); + predecessors->fillElements((GlobalType) -1); + predecessors->columnScatter(); + + // Setup initial frontier from bitmap frontier + for (int i = 0; i < numBlocks; i++) { + cudaStream_t stream = blockStreams[i]; + int32_t device = deviceAssignments[i]; + cudaSetDevice(device); + convert_bitmap_to_queue(frontier_bmap->getCurrent(i), + frontier_bmap->getN(), + offset, + frontier->get(i), + frontierSize->get(i), + stream); + cudaMemcpyAsync(&frontierSize_h[i], + frontierSize->get(i), + sizeof(LocalType), + cudaMemcpyDefault, + stream); + } + description.syncAllStreams(); + + // Main iteration loop + int32_t globalSources = 1; + LocalType level = 1; + while (globalSources > 0) { + +// std::cout << "Starting with level " << level << "\n"; + + // Remove frontier nodes with locally zero degree + for (int i = 0; i < numBlocks; i++) { + // Checking that there is work to be done 
for this block + if (frontierSize_h[i] > 0) { + // Write out the degree of each frontier node into exSumDegree + degreeIterator degreeIt(M->getBlockMatrix(i)->get_raw_row_offsets()); + cudaStream_t stream = blockStreams[i]; + cudaSetDevice(deviceAssignments[i]); + set_degree_flags( degreeFlags->get(i), + frontier->get(i), + degreeIt, + frontierSize_h[i], + stream); +// set_frontier_degree(exSumDegree->get(i), +// frontier->get(i), +// degreeIt, +// frontierSize_h[i], +// stream); +// +// cudaStreamSynchronize(stream); +// std::cout << "Block " << i << " before compaction.\n"; +// debug::printDeviceVector(frontier->get(i), frontierSize_h[i], "Frontier"); +// debug::printDeviceVector(exSumDegree->get(i), frontierSize_h[i], "Frontier Degree"); + + // Use degreeIterator as flags to compact the frontier + cudaSetDevice(deviceAssignments[i]); + size_t numBytes = exSumStorage->getN(); + cub::DeviceSelect::Flagged(exSumStorage->get(i), + numBytes, + frontier->get(i), + degreeFlags->get(i), + trim_frontier->get(i), + frontierSize->get(i), + frontierSize_h[i], + stream); + cudaMemcpyAsync(&frontierSize_h[i], + frontierSize->get(i), + sizeof(LocalType), + cudaMemcpyDefault, + stream); + } + } + description.syncAllStreams(); + + // Setup load balancing for main kernel call + for (int i = 0; i < numBlocks; i++) { + // Checking that there is work to be done for this block: + if (frontierSize_h[i] > 0) { + // Write out the degree of each frontier node into exSumDegree + degreeIterator degreeIt(M->getBlockMatrix(i)->get_raw_row_offsets()); + cudaStream_t stream = blockStreams[i]; + cudaSetDevice(deviceAssignments[i]); + set_frontier_degree(exSumDegree->get(i), + trim_frontier->get(i), + degreeIt, + frontierSize_h[i], + stream); + +// cudaStreamSynchronize(stream); +// std::cout << "Block " << i << " after compaction.\n"; +// debug::printDeviceVector(trim_frontier->get(i), frontierSize_h[i], "Frontier"); +// debug::printDeviceVector(exSumDegree->get(i), frontierSize_h[i], "Frontier Degree"); + + // Get the exclusive sum of the frontier degrees, store in exSumDegree + size_t numBytes = exSumStorage->getN(); + cub::DeviceScan::ExclusiveSum(exSumStorage->get(i), + numBytes, + exSumDegree->get(i), + exSumDegree->get(i), + frontierSize_h[i] + 1, + stream); + cudaMemcpyAsync(&frontierDegree_h[i], + exSumDegree->get(i) + frontierSize_h[i], + sizeof(LocalType), + cudaMemcpyDefault, + stream); + } + } + description.syncAllStreams(); + +// for (int i = 0; i < numBlocks; i++) { +// std::cout << "Block " << i << " frontierNodes " << frontierSize_h[i] +// << " frontierDegree " << frontierDegree_h[i] << "\n"; +// } + + for (int i = 0; i < numBlocks; i++) { + // Checking that there is work to be done for this block: + if (frontierSize_h[i] > 0) { + cudaStream_t stream = blockStreams[i]; + cudaSetDevice(deviceAssignments[i]); + compute_bucket_offsets(exSumDegree->get(i), + bucketOffsets->get(i), + frontierSize_h[i], + frontierDegree_h[i], + stream); + } + } + + // Call main kernel to get new frontier + frontier_bmap->fillElements(0); + frontier_bmap->rowScatter(); + for (int i = 0; i < numBlocks; i++) { + // Checking that there is work to be done for this block: + if (frontierDegree_h[i] > 0) { + cudaSetDevice(deviceAssignments[i]); + frontier_expand(M->getBlockMatrix(i)->get_raw_row_offsets(), + M->getBlockMatrix(i)->get_raw_column_indices(), + trim_frontier->get(i), + frontierSize_h[i], + frontierDegree_h[i], + level, + frontier_bmap->getCurrent(i), + exSumDegree->get(i), + bucketOffsets->get(i), + 
visited_bmap->getCurrent(i), + distances->getCurrent(i), + predecessors->getCurrent(i), + blockStreams[i]); + +// cudaStreamSynchronize(blockStreams[i]); +// int bitsSet = +// thrust::reduce(thrust::device, +// thrust::make_transform_iterator(frontier_bmap->getCurrent(i), +// popCount()), +// thrust::make_transform_iterator(frontier_bmap->getCurrent(i) +// + frontier_bmap->getN(), +// popCount())); +// std::cout << "Block " << i << " Level " << level << " has " << bitsSet << " bits set\n"; + } + } + description.syncAllStreams(); + + // Update and propogate new frontier and visited bitmaps + frontier_bmap->template columnReduce(); + frontier_bmap->rowScatter(); + visited_bmap->template columnReduce(); + visited_bmap->columnScatter(); + + // Convert bitmap frontier to list frontier and update globalSources + frontierSize->fillElements(0); + frontierSize->rowScatter(); + for (int i = 0; i < numBlocks; i++) { + cudaStream_t stream = blockStreams[i]; + int32_t device = deviceAssignments[i]; + cudaSetDevice(device); + convert_bitmap_to_queue(frontier_bmap->getCurrent(i), + frontier_bmap->getN(), + offset, + frontier->get(i), + frontierSize->get(i), + stream); + cudaMemcpyAsync(&frontierSize_h[i], + frontierSize->get(i), + sizeof(LocalType), + cudaMemcpyDefault, + stream); + } + description.syncAllStreams(); + GlobalType blockRows = description.getBlockRows(); + globalSources = 0; + for (int i = 0; i < blockRows; i++) { + int32_t bId = description.getBlockId(i, i); + globalSources += frontierSize_h[bId]; + } + +// std::cout << "Finished with level " << level << " frontiers:\n"; +// for (int i = 0; i < numBlocks; i++) +// std::cout << "\tBlock " << i << " : " << frontierSize_h[i] << "\n"; + + // Increment level + level++; + } + + // Globalize the predecessors by row + for (int i = 0; i < numBlocks; i++) { + cudaStream_t stream = blockStreams[i]; + int32_t device = deviceAssignments[i]; + cudaSetDevice(device); + int32_t rowId = description.getBlockRow(i); + GlobalType globalOffset = rowId * description.getOffset(); + globalize_ids(predecessors->getCurrent(i), + globalOffset, + (GlobalType) predecessors->getN(), + stream); + } + description.syncAllStreams(); + + // Propogate predecessors and distances + predecessors->template columnReduce(); + distances->template columnReduce(); + + // Copy out predecessors and distances to user provided locations + LocalType* temp = (LocalType*) malloc(distances->getN() * sizeof(LocalType)); + int32_t writeOffset = 0; + int32_t numRows = description.getNumRows(); + int32_t blockRows = description.getBlockRows(); + for (int i = 0; i < blockRows; i++) { + // Copy out the data for the block on the diagonal + int32_t bId = description.getBlockId(i, i); + int32_t n = predecessors->getN(); + cudaMemcpy(temp, predecessors->getCurrent(bId), n * sizeof(LocalType), cudaMemcpyDefault); + for (int j = 0; j < n; j++) { + if (writeOffset + j < numRows) + predecessors_out[writeOffset + j] = temp[j]; + } + cudaMemcpy(temp, distances->getCurrent(bId), n * sizeof(LocalType), cudaMemcpyDefault); + for (int j = 0; j < n; j++) { + if (writeOffset + j < numRows) + distances_out[writeOffset + j] = temp[j]; + } + writeOffset += n; + } + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs2d::traverse(GlobalType *source_vertices, + int32_t nsources) { + for (int32_t i = 0; i < nsources; i++) { + traverse(source_vertices[i]); + } + return NVGRAPH_OK; + } + + template class Bfs2d ; +} diff --git a/cpp/nvgraph/cpp/src/bfs_kernels.cu b/cpp/nvgraph/cpp/src/bfs_kernels.cu new file mode 
100644 index 00000000000..594e2b980ca --- /dev/null +++ b/cpp/nvgraph/cpp/src/bfs_kernels.cu @@ -0,0 +1,1580 @@ + +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include + +#define MAXBLOCKS 65535 +#define WARP_SIZE 32 +#define INT_SIZE 32 + +// +// Bottom up macros +// + +#define FILL_UNVISITED_QUEUE_DIMX 256 + +#define COUNT_UNVISITED_EDGES_DIMX 256 + +#define MAIN_BOTTOMUP_DIMX 256 +#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX/WARP_SIZE) + +#define LARGE_BOTTOMUP_DIMX 256 + +//Number of edges processed in the main bottom up kernel +#define MAIN_BOTTOMUP_MAX_EDGES 6 + +//Power of 2 < 32 (strict <) +#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 + +// +// Top down macros +// + +// We will precompute the results the binsearch_maxle every TOP_DOWN_BUCKET_SIZE edges +#define TOP_DOWN_BUCKET_SIZE 32 + +// DimX of the kernel +#define TOP_DOWN_EXPAND_DIMX 256 + +// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets +#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE) + +// How many items_per_thread we can process with one bucket_offset loading +// the -1 is here because we need the +1 offset +#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) + +// instruction parallelism +// for how many edges will we create instruction parallelism +#define TOP_DOWN_BATCH_SIZE 2 + +#define COMPUTE_BUCKET_OFFSETS_DIMX 512 + +//Other macros + +#define FLAG_ISOLATED_VERTICES_DIMX 128 + +//Number of vertices handled by one thread +//Must be power of 2, lower than 32 +#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 + +//Number of threads involved in the "construction" of one int in the bitset +#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT (INT_SIZE/FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) + +// +// Parameters of the heuristic to switch between bottomup/topdown +//Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf +// + +using namespace nvgraph; + +namespace bfs_kernels { + // + // gives the equivalent vectors from a type + // for the max val, would be better to use numeric_limits<>::max() once + // cpp11 is allowed in nvgraph + // + + template + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + }; + + template<> + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + static const int max = INT_MAX; + }; + + template<> + struct vec_t { + typedef longlong4 vec4; + typedef longlong2 vec2; + static const long long int max = LLONG_MAX; + }; + + // + // ------------------------- Helper device functions ------------------- + // + + __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { + if (n == INT_SIZE) + return (~0); + int mask = (1 << n) - 1; + return mask; + } + + __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { + if (n == 0) + return 0; + int mask = ~((1 << (INT_SIZE - n)) - 1); + return mask; + } + + __forceinline__ __device__ int getNextZeroBit(int& val) { + int ibit = __ffs(~val) - 1; + val 
|= (1 << ibit); + + return ibit; + } + + struct BitwiseAnd + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a & b); + } + }; + + struct BitwiseOr + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a | b); + } + }; + + template + __device__ IndexType binsearch_maxle( const IndexType *vec, + const IndexType val, + IndexType low, + IndexType high) { + while (true) { + if (low == high) + return low; //we know it exists + if ((low + 1) == high) + return (vec[high] <= val) ? high : low; + + IndexType mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + + } + } + + // + // ------------------------- Bottom up ------------------------- + // + + // + // fill_unvisited_queue_kernel + // + // Finding unvisited vertices in the visited_bmap, and putting them in the queue + // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted + // For instance, the queue can look like this : + // 34 38 45 58 61 4 18 24 29 71 84 85 90 + // Because they are represented by those ints in the bitmap : + // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] + + //visited_bmap_nints = the visited_bmap is made of that number of ints + + template + __global__ void fill_unvisited_queue_kernel( int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) + //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in + //unvisited_common_block_offset + __shared__ IndexType unvisited_common_block_offset; + + //We don't want threads divergence in the loop (we're going to call __syncthreads) + //Using a block-only dependent in the condition of the loop + for (IndexType block_v_idx = blockIdx.x * blockDim.x; + block_v_idx < visited_bmap_nints; + block_v_idx += blockDim.x * gridDim.x) { + + //Index of visited_bmap that this thread will compute + IndexType v_idx = block_v_idx + threadIdx.x; + + int thread_visited_int = (v_idx < visited_bmap_nints) + ? 
visited_bmap[v_idx] + : + (~0); //will be neutral in the next lines (virtual vertices all visited) + + //The last int can only be partially valid + //If we are indeed taking care of the last visited int in this thread, + //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) + if (v_idx == (visited_bmap_nints - 1)) { + int active_bits = n - (INT_SIZE * v_idx); + int inactive_bits = INT_SIZE - active_bits; + int mask = getMaskNLeftmostBitSet(inactive_bits); + thread_visited_int |= mask; //Setting inactive bits as visited + } + + //Counting number of unvisited vertices represented by this int + int n_unvisited_in_int = __popc(~thread_visited_int); + int unvisited_thread_offset; + + //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue + //We ask for that space when computing the block scan, that will tell where to write those + //vertices in the queue, using the common offset of the block (see below) + BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); + + //Last thread knows how many vertices will be written to the queue by this block + //Asking for that space in the queue using the global count, and saving the common offset + if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { + IndexType total = unvisited_thread_offset + n_unvisited_in_int; + unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); + } + + //syncthreads for two reasons : + // - we need to broadcast unvisited_common_block_offset + // - we will reuse scan_temp_storage (cf CUB doc) + __syncthreads(); + + IndexType current_unvisited_index = unvisited_common_block_offset + + unvisited_thread_offset; + int nvertices_to_write = n_unvisited_in_int; + + // getNextZeroBit uses __ffs, which gives least significant bit set + // which means that as long as n_unvisited_in_int is valid, + // we will use valid bits + + while (nvertices_to_write > 0) { + if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { + typename vec_t::vec4 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); + *unvisited_i4 = vec_v; + + current_unvisited_index += 4; + nvertices_to_write -= 4; + } + else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { + typename vec_t::vec2 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); + *unvisited_i2 = vec_v; + + current_unvisited_index += 2; + nvertices_to_write -= 2; + } else { + IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + unvisited[current_unvisited_index] = v; + + current_unvisited_index += 1; + nvertices_to_write -= 1; + } + + } + } + } + + //Wrapper + template + void fill_unvisited_queue( int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = FILL_UNVISITED_QUEUE_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); + + fill_unvisited_queue_kernel<<>>( 
visited_bmap, + visited_bmap_nints, + n, + unvisited, + unvisited_cnt); + cudaCheckError() + ; + } + + // + // count_unvisited_edges_kernel + // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue + // We need the current unvisited vertices to be in the unvisited queue + // But visited vertices can be in the potentially_unvisited queue + // We first check if the vertex is still unvisited before using it + // Useful when switching from "Bottom up" to "Top down" + // + + template + __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *degree_vertices, + IndexType *mu) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + + //number of undiscovered edges counted by this thread + IndexType thread_unvisited_edges_count = 0; + + for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < potentially_unvisited_size; + idx += blockDim.x * gridDim.x) { + + IndexType u = potentially_unvisited[idx]; + int u_visited_bmap = visited_bmap[u / INT_SIZE]; + int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); + + if (!is_visited) + thread_unvisited_edges_count += degree_vertices[u]; + + } + + //We need all thread_unvisited_edges_count to be ready before reducing + __syncthreads(); + + IndexType block_unvisited_edges_count = + BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); + + //block_unvisited_edges_count is only defined is th.x == 0 + if (threadIdx.x == 0) + atomicAdd(mu, block_unvisited_edges_count); + } + + //Wrapper + template + void count_unvisited_edges(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *node_degree, + IndexType *mu, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COUNT_UNVISITED_EDGES_DIMX; + grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); + + count_unvisited_edges_kernel<<>>( potentially_unvisited, + potentially_unvisited_size, + visited_bmap, + node_degree, + mu); + cudaCheckError() + ; + } + + // + // Main Bottom Up kernel + // Here we will start to process unvisited vertices in the unvisited queue + // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges + // If it's not possible to define a valid parent using only those edges, + // add it to the "left_unvisited_queue" + // + + // + // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property + // It is used to do a reduction locally and fully build the new visited_bmap + // + + template + __global__ void main_bottomup_kernel( const IndexType *unvisited, + const IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *left_unvisited_cnt, + int *visited_bmap, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + typedef cub::BlockDiscontinuity BlockDiscontinuity; + typedef cub::WarpReduce WarpReduce; + typedef cub::BlockScan BlockScan; + + __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; + __shared__ typename WarpReduce::TempStorage reduce_temp_storage; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //To write vertices in the frontier, + //We will use a block scan to 
locally compute the offsets + //frontier_common_block_offset contains the common offset for the block + __shared__ IndexType frontier_common_block_offset; + + // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints + // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) + // vertices represented by the same int will be designed as part of the same "group" + // To detect the deliminations between those groups, we use BlockDiscontinuity + // Then we need to create the new "visited_bmap" within those group. + // We use a warp reduction that takes into account limits between groups to do it + // But a group can be cut in two different warps : in that case, the second warp + // put the result of its local reduction in local_visited_bmap_warp_head + // the first warp will then read it and finish the reduction + + __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; + + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + + // we will call __syncthreads inside the loop + // we need to keep complete block active + for (IndexType block_off = blockIdx.x * blockDim.x; + block_off < unvisited_size; + block_off += blockDim.x * gridDim.x) + { + IndexType idx = block_off + threadIdx.x; + + // This thread will take care of unvisited_vertex + // in the visited_bmap, it is represented by the int at index + // visited_bmap_index = unvisited_vertex/INT_SIZE + // it will be used by BlockDiscontinuity + // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) + IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one + visited_bmap_index[0] = -1; + IndexType unvisited_vertex = -1; + + // local_visited_bmap gives info on the visited bit of unvisited_vertex + // + // By default, everything is visited + // This is because we only take care of unvisited vertices here, + // The other are by default unvisited + // If a vertex remain unvisited, we will notice it here + // That's why by default we consider everything visited ( ie ~0 ) + // If we fail to assign one parent to an unvisited vertex, we will + // explicitly unset the bit + int local_visited_bmap = (~0); + int found = 0; + int more_to_visit = 0; + IndexType valid_parent; + IndexType left_unvisited_off; + + if (idx < unvisited_size) + { + //Processing first STPV edges of unvisited v + //If bigger than that, push to left_unvisited queue + unvisited_vertex = unvisited[idx]; + + IndexType edge_begin = row_ptr[unvisited_vertex]; + IndexType edge_end = row_ptr[unvisited_vertex + 1]; + + visited_bmap_index[0] = unvisited_vertex / INT_SIZE; + + IndexType degree = edge_end - edge_begin; + + for (IndexType edge = edge_begin; + edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) + { + if (edge_mask && !edge_mask[edge]) + continue; + + IndexType parent_candidate = col_ind[edge]; + + if (distances[parent_candidate] == (lvl - 1)) + { + found = 1; + valid_parent = parent_candidate; + break; + } + } + + // This vertex will remain unvisited at the end of this kernel + // Explicitly say it + if (!found) + local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited + else + { + if (distances) + distances[unvisited_vertex] = lvl; + if (predecessors) + predecessors[unvisited_vertex] = valid_parent; + } + + //If we haven't found a parent and there's more edge to check + if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) + { + 
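+ // Defer this vertex to the large-degree pass: reserve a slot in the left_unvisited queue so
+ // that bottom_up_large_degree_kernel can scan the edges beyond MAIN_BOTTOMUP_MAX_EDGES.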
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); //TODO scan + more_to_visit = 1; + } + + } + + // + // We will separate vertices in group + // Two vertices are in the same group if represented by same int in visited_bmap + // ie u and v in same group <=> u/32 == v/32 + // + // We will now flag the head of those group (first element of each group) + // + // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) + // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained + // at most by two warps + + int is_head_a[1]; //CUB need an array + BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, + visited_bmap_index, + cub::Inequality()); + int is_head = is_head_a[0]; + + // Computing the warp reduce within group + // This primitive uses the is_head flags to know where the limits of the groups are + // We use bitwise and as operator, because of the fact that 1 is the default value + // If a vertex is unvisited, we have to explicitly ask for it + int local_bmap_agg = + WarpReduce(reduce_temp_storage).HeadSegmentedReduce( local_visited_bmap, + is_head, + BitwiseAnd()); + + // We need to take care of the groups cut in two in two different warps + // Saving second part of the reduce here, then applying it on the first part bellow + // Corner case : if the first thread of the warp is a head, then this group is not cut in two + // and then we have to be neutral (for an bitwise and, it's an ~0) + if (laneid == 0) + { + local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; + } + + //broadcasting local_visited_bmap_warp_head + __syncthreads(); + + int head_ballot = nvgraph::utils::ballot(is_head); + + //As long as idx < unvisited_size, we know there's at least one head per warp + int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); + + int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); + + // if laneid == 0 && is_last_head_in_warp, it's a special case where + // a group of size 32 starts exactly at lane 0 + // in that case, nothing to do (this group is not cut by a warp delimitation) + // we also have to make sure that a warp actually exists after this one (this corner case is handled after) + if (laneid != 0 && is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS) + { + local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; + } + + //Three cases : + // -> This is the first group of the block - it may be cut in two (with previous block) + // -> This is the last group of the block - same thing + // -> This group is completely contained in this block + + if (warpid == 0 && laneid == 0) + { + //The first elt of this group considered in this block is unvisited_vertex + //We know that's the case because elts are sorted in a group, and we are at laneid == 0 + //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex + int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid + int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); + local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && + laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case + idx < unvisited_size //we could be out + ) + { + //Last head of the block + //We don't know if this group is complete + + //last_v is the last unvisited_vertex of the group IN THIS 
block + //we dont know about the rest - we have to be neutral about elts > last_v + + //the destination thread of the __shfl is active + int laneid_max = min((IndexType) (WARP_SIZE - 1), + (unvisited_size - (block_off + 32 * warpid))); + IndexType last_v = nvgraph::utils::shfl( unvisited_vertex, + laneid_max, + WARP_SIZE, + __activemask()); + + if (is_last_head_in_warp) + { + int ilast_v = last_v % INT_SIZE + 1; + int mask = getMaskNRightmostBitSet(ilast_v); + local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + } + else + { + //group completely in block + if (is_head && idx < unvisited_size) { + visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int + } + } + + //Saving in frontier + + int thread_frontier_offset; + BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); + IndexType inclusive_sum = thread_frontier_offset + found; + if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) + { + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //1) Broadcasting frontier_common_block_offset + //2) we want to reuse the *_temp_storage + __syncthreads(); + + if (found) + new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; + if (more_to_visit) + left_unvisited[left_unvisited_off] = unvisited_vertex; + + } + } + + template + void bottom_up_main( IndexType *unvisited, + IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *d_left_unvisited_idx, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = MAIN_BOTTOMUP_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); + + main_bottomup_kernel<<>>(unvisited, + unvisited_size, + left_unvisited, + d_left_unvisited_idx, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError() + ; + } + + // + // bottom_up_large_degree_kernel + // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found + // + template + __global__ void bottom_up_large_degree_kernel( IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + + int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + + //Inactive threads are not a pb for __ballot (known behaviour) + for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; + idx < left_unvisited_size; + idx += gridDim.x * logical_warps_per_block) { + + //Unvisited vertices - potentially in the next frontier + IndexType v = left_unvisited[idx]; + + //Used only with symmetric graphs + //Parents are included in v's neighbors + IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in find_unvisited + + 
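+ // The remaining edges of v, up to row_ptr[v + 1], are scanned cooperatively by the
+ // BOTTOM_UP_LOGICAL_WARP_SIZE threads of this logical warp, one edge per thread per
+ // iteration, until the ballot below finds a thread holding a valid parent.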
IndexType end_i_edge = row_ptr[v + 1]; + + //We can have warp divergence in the next loop + //It's not a pb because the behaviour of __ballot + //is know with inactive threads + for (IndexType i_edge = first_i_edge + logical_lane_id; + i_edge < end_i_edge; + i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { + + IndexType valid_parent = -1; + + if (!edge_mask || edge_mask[i_edge]) { + IndexType u = col_ind[i_edge]; + IndexType lvl_u = distances[u]; + + if (lvl_u == (lvl - 1)) { + valid_parent = u; + } + } + + unsigned int warp_valid_p_ballot = nvgraph::utils::ballot((valid_parent != -1)); + + int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; + unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; + unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot + >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); + logical_warp_valid_p_ballot &= mask; + + int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; + + if (chosen_thread == logical_lane_id) { + //Using only one valid parent (reduce bw) + IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); + int m = 1 << (v % INT_SIZE); + atomicOr(&visited[v / INT_SIZE], m); + distances[v] = lvl; + + if (predecessors) + predecessors[v] = valid_parent; + + new_frontier[off] = v; + } + + if (logical_warp_valid_p_ballot) { + break; + } + } + + } + } + + template + void bottom_up_large(IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = LARGE_BOTTOMUP_DIMX; + grid.x = min( (IndexType) MAXBLOCKS, + ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); + + bottom_up_large_degree_kernel<<>>(left_unvisited, + left_unvisited_size, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError() + ; + } + + // + // + // ------------------------------ Top down ------------------------------ + // + // + + // + // compute_bucket_offsets_kernel + // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer + // + + template + __global__ void compute_bucket_offsets_kernel( const IndexType *frontier_degrees_exclusive_sum, + IndexType *bucket_offsets, + const IndexType frontier_size, + IndexType total_degree) { + IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1); + + for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; + bid <= end; + bid += gridDim.x * blockDim.x) { + + IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); + + bucket_offsets[bid] = binsearch_maxle( frontier_degrees_exclusive_sum, + eid, + (IndexType) 0, + frontier_size - 1); + + } + } + + template + void compute_bucket_offsets( IndexType *cumul, + IndexType *bucket_offsets, + IndexType frontier_size, + IndexType total_degree, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COMPUTE_BUCKET_OFFSETS_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); + + compute_bucket_offsets_kernel<<>>(cumul, + bucket_offsets, + frontier_size, + total_degree); + cudaCheckError() + ; + } + + // + // 
topdown_expand_kernel + // Read current frontier and compute new one with top down paradigm + // One thread = One edge + // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) + // This index k will give us the origin of this edge, which is frontier[k] + // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] + // + // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches + // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges + // + // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k + // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory + // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) + // + // We will then look which vertices are not visited yet : + // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on + // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue + // + // We then treat the candidates queue using the threadIdx.x < ncandidates + // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) + // We add it to the new frontier + // + + template + __global__ void topdown_expand_kernel( const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType max_items_per_thread, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed) { + //BlockScan + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_storage; + + // We will do a scan to know where to write in frontier + // This will contain the common offset of the block + __shared__ IndexType frontier_common_block_offset; + + __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; + __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; + + // + // Frontier candidates local queue + // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything + // We also save the predecessors here, because we will not be able to retrieve it after + // + __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType block_n_frontier_candidates; + + IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; + IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) + / TOP_DOWN_EXPAND_DIMX; + + n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); + + for (; + 
(n_items_per_thread_left > 0) && (block_offset < totaldegree); + + block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, + n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { + + // In this loop, we will process batch_set_size batches + IndexType nitems_per_thread = min( n_items_per_thread_left, + (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); + + // Loading buckets offset (see compute_bucket_offsets_kernel) + + if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) + shared_buckets_offsets[threadIdx.x] = + frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + + threadIdx.x]; + + // We will use shared_buckets_offsets + __syncthreads(); + + // + // shared_buckets_offsets gives us a range of the possible indexes + // for edge of linear_threadx, we are looking for the value k such as + // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx + // + // we have 0 <= k < frontier_size + // but we also have : + // + // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] + // <= k + // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] + // + // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) + // We will load them here + // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop + // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) + + //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ + //If it doesn't fit, --right until it does, then loop + //It is excepted to fit on the first try, that's why we start right = nitems_per_thread + + IndexType left = 0; + IndexType right = nitems_per_thread; + + while (left < nitems_per_thread) { + // + // Values that are necessary to compute the local binary searches + // We only need those with indexes between extremes indexes of buckets_offsets + // We need the next val for the binary search, hence the +1 + // + + IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + + //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 + while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { + --right; + + nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + } + + IndexType nitems_per_thread_for_this_load = right - left; + + IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left + * NBUCKETS_PER_BLOCK]; + + //TODO put again the nvalues_to_load == 1 + if (threadIdx.x < nvalues_to_load) { + shared_frontier_degrees_exclusive_sum[threadIdx.x] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + threadIdx.x]; + } + + if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { + shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + TOP_DOWN_EXPAND_DIMX]; + } + + //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync + //TODO we don't use it if nvalues_to_load == 1 + __syncthreads(); + + // Now we will process the edges + // Here each thread will process nitems_per_thread_for_this_load + for (IndexType item_index = 0; + item_index < 
nitems_per_thread_for_this_load; + item_index += TOP_DOWN_BATCH_SIZE) { + + // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction parallism) + // Reduces latency + + IndexType current_max_edge_index = min(block_offset + + (left + + nitems_per_thread_for_this_load) + * blockDim.x, + totaldegree); + + //We will need vec_u (source of the edge) until the end if we need to save the predecessors + //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) + + IndexType vec_u[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; + + IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; + +#pragma unroll + for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + + IndexType ibatch = left + item_index + iv; + IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; + + if (gid < current_max_edge_index) { + IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) + / TOP_DOWN_BUCKET_SIZE; + IndexType bucket_start = shared_buckets_offsets[start_off_idx] + - frontier_degrees_exclusive_sum_block_offset; + IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] + - frontier_degrees_exclusive_sum_block_offset; + + IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, + gid, + bucket_start, + bucket_end) + + frontier_degrees_exclusive_sum_block_offset; + vec_u[iv] = frontier[k]; // origin of this edge + vec_frontier_degrees_exclusive_sum_index[iv] = + frontier_degrees_exclusive_sum[k]; + } else { + vec_u[iv] = -1; + vec_frontier_degrees_exclusive_sum_index[iv] = -1; + } + + } + + IndexType *vec_row_ptr_u = &local_buf1[0]; +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType u = vec_u[iv]; + //row_ptr for this vertex origin u + vec_row_ptr_u[iv] = (u != -1) + ? row_ptr[u] + : + -1; + } + + //We won't need row_ptr after that, reusing pointer + IndexType *vec_dest_v = vec_row_ptr_u; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType thread_item_index = left + item_index + iv; + IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; + + IndexType row_ptr_u = vec_row_ptr_u[iv]; + IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; + + if (edge_mask && !edge_mask[edge]) + row_ptr_u = -1; //disabling edge + + //Destination of this edge + vec_dest_v[iv] = (row_ptr_u != -1) + ? col_ind[edge] + : + -1; + } + + //We don't need vec_frontier_degrees_exclusive_sum_index anymore + IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_dest_v[iv]; + vec_v_visited_bmap[iv] = (v != -1) + ? 
bmap[v / INT_SIZE] + : + (~0); //will look visited + } + + // From now on we will consider v as a frontier candidate + // If for some reason vec_candidate[iv] should be put in the new_frontier + // Then set vec_candidate[iv] = -1 + IndexType *vec_frontier_candidate = vec_dest_v; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + + int is_visited = vec_v_visited_bmap[iv] & m; + + if (is_visited) + vec_frontier_candidate[iv] = -1; + } + + if (directed) { + //vec_v_visited_bmap is available + + IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + vec_is_isolated_bmap[iv] = (v != -1) + ? isolated_bmap[v / INT_SIZE] + : + -1; + } + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + int is_isolated = vec_is_isolated_bmap[iv] & m; + + //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) + // 1st reason : it's useless + // 2nd reason : it will make top down algo fail + // we need each node in frontier to have a degree > 0 + // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr + + if (is_isolated && v != -1) { + int m = 1 << (v % INT_SIZE); + atomicOr(&bmap[v / INT_SIZE], m); + if (distances) + distances[v] = lvl; + + if (predecessors) + predecessors[v] = vec_u[iv]; + + //This is no longer a candidate, neutralize it + vec_frontier_candidate[iv] = -1; + } + + } + } + + //Number of successor candidate hold by this thread + IndexType thread_n_frontier_candidates = 0; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + if (v != -1) + ++thread_n_frontier_candidates; + } + + // We need to have all nfrontier_candidates to be ready before doing the scan + __syncthreads(); + + // We will put the frontier candidates in a local queue + // Computing offsets + IndexType thread_frontier_candidate_offset = 0; //offset inside block + BlockScan(scan_storage).ExclusiveSum( thread_n_frontier_candidates, + thread_frontier_candidate_offset); + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + //May have bank conflicts + IndexType frontier_candidate = vec_frontier_candidate[iv]; + + if (frontier_candidate != -1) { + shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = + frontier_candidate; + shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = + vec_u[iv]; + ++thread_frontier_candidate_offset; + } + } + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + //No need to add nsuccessor_candidate, even if its an + //exclusive sum + //We incremented the thread_frontier_candidate_offset + block_n_frontier_candidates = thread_frontier_candidate_offset; + } + + //broadcast block_n_frontier_candidates + __syncthreads(); + + IndexType naccepted_vertices = 0; + //We won't need vec_frontier_candidate after that + IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + vec_frontier_accepted_vertex[iv] = -1; + + if (idx_shared < block_n_frontier_candidates) { + IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue + int m 
= 1 << (v % INT_SIZE); + int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old + + if (!(m & q)) { //if this thread was the first to discover this node + if (distances) + distances[v] = lvl; + + if (predecessors) { + IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; + predecessors[v] = pred; + } + + vec_frontier_accepted_vertex[iv] = v; + ++naccepted_vertices; + } + } + + } + + //We need naccepted_vertices to be ready + __syncthreads(); + + IndexType thread_new_frontier_offset; + + BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + + IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; + //for this thread, thread_new_frontier_offset + has_successor (exclusive sum) + if (inclusive_sum) + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //Broadcasting frontier_common_block_offset + __syncthreads(); + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + if (idx_shared < block_n_frontier_candidates) { + + IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; + + if (new_frontier_vertex != -1) { + IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; + //TODO Access is not good + new_frontier[off] = new_frontier_vertex; + } + } + } + + } + + //We need to keep shared_frontier_degrees_exclusive_sum coherent + __syncthreads(); + + //Preparing for next load + left = right; + right = nitems_per_thread; + } + + //we need to keep shared_buckets_offsets coherent + __syncthreads(); + } + + } + + template + void frontier_expand(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *visited_bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed, + cudaStream_t m_stream, + bool deterministic) { + if (!totaldegree) + return; + + dim3 block; + block.x = TOP_DOWN_EXPAND_DIMX; + + IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) + / (MAXBLOCKS * block.x); + + dim3 grid; + grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) + / (max_items_per_thread * block.x), + (IndexType) MAXBLOCKS); + + topdown_expand_kernel<<>>( row_ptr, + col_ind, + frontier, + frontier_size, + totaldegree, + max_items_per_thread, + lvl, + new_frontier, + new_frontier_cnt, + frontier_degrees_exclusive_sum, + frontier_degrees_exclusive_sum_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed); + cudaCheckError() + ; + } + + template + __global__ void flag_isolated_vertices_kernel( IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated) { + typedef cub::BlockLoad BlockLoad; + typedef cub::BlockStore BlockStore; + typedef cub::BlockReduce BlockReduce; + typedef cub::WarpReduce WarpReduce; + + __shared__ typename BlockLoad::TempStorage load_temp_storage; + __shared__ typename BlockStore::TempStorage store_temp_storage; + __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; + + __shared__ typename WarpReduce::TempStorage 
warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX + / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; + + __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; + + for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + * (blockDim.x * blockIdx.x); + block_off < n; + block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { + + IndexType thread_off = block_off + + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; + IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; + + IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] + + BlockLoad(load_temp_storage).Load( row_ptr + block_off, + thread_row_ptr, + block_valid_items, + -1); + + //To compute 4 degrees, we need 5 values of row_ptr + //Saving the "5th" value in shared memory for previous thread to use + if (threadIdx.x > 0) { + row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; + } + + //If this is the last thread, it needs to load its row ptr tail value + if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { + row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; + + } + __syncthreads(); // we may reuse temp_storage + + int local_isolated_bmap = 0; + + IndexType imax = (n - thread_off); + + IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + +#pragma unroll + for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { + IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; + + if (i < imax) + local_isolated_bmap |= ((degree == 0) << i); + } + + if (last_node_thread < n) { + IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = + row_ptr_tail[threadIdx.x] + - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; + + local_isolated_bmap |= ((degree == 0) + << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); + + } + + local_isolated_bmap <<= (thread_off % INT_SIZE); + + IndexType local_nisolated = __popc(local_isolated_bmap); + + //We need local_nisolated and local_isolated_bmap to be ready for next steps + __syncthreads(); + + IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); + + if (threadIdx.x == 0 && total_nisolated) { + atomicAdd(nisolated, total_nisolated); + } + + int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; + + //Building int for bmap + int int_aggregate_isolated_bmap = + WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce( local_isolated_bmap, + BitwiseOr()); + + int is_head_of_visited_int = + ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); + if (is_head_of_visited_int) { + isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; + } + + BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); + } + } + + template + void flag_isolated_vertices( IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = FLAG_ISOLATED_VERTICES_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); + + flag_isolated_vertices_kernel<<>>(n, + isolated_bmap, + row_ptr, + degrees, + nisolated); + cudaCheckError() + ; + } + + // + // + // + // Some utils functions + // + // + + //Creates CUB data for graph size n + 
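// Usage sketch for the CUB scan helpers defined just below
// (cub_exclusive_sum_alloc / exclusive_sum). CUB's DeviceScan uses a two-phase
// convention: the first call, made with a NULL temporary-storage pointer, only
// reports the required scratch size, and the second call performs the scan.
// Illustrative example only; it assumes <cub/cub.cuh> is already included (as
// it is in this file), the names d_degrees / d_offsets are hypothetical, and
// error checking is omitted for brevity.
inline void exclusive_sum_usage_example(const int *d_degrees,
                                        int *d_offsets,
                                        int n,
                                        cudaStream_t stream)
{
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  // Phase 1: size query only; no scan is performed while d_temp_storage is NULL
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                d_degrees, d_offsets, n, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Phase 2: the actual exclusive prefix sum (d_offsets[0] becomes 0)
  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                d_degrees, d_offsets, n, stream);
  cudaFree(d_temp_storage);
}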
template + void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { + // Determine temporary device storage requirements for exclusive prefix scan + d_temp_storage = NULL; + temp_storage_bytes = 0; + IndexType *d_in = NULL, *d_out = NULL; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); + // Allocate temporary storage for exclusive prefix scan + cudaMalloc(&d_temp_storage, temp_storage_bytes); + } + + template + __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { + for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; + u < n; + u += gridDim.x * blockDim.x) + vec[u] = val; + + } + + template + void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + fill_kernel<<>>(vec, n, val); + cudaCheckError() + ; + } + + template + __global__ void set_frontier_degree_kernel( IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType u = frontier[idx]; + frontier_degree[idx] = degree[u]; + } + } + + template + void set_frontier_degree( IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + set_frontier_degree_kernel<<>>(frontier_degree, + frontier, + degree, + n); + cudaCheckError() + ; + } + + template + void exclusive_sum( void *d_temp_storage, + size_t temp_storage_bytes, + IndexType *d_in, + IndexType *d_out, + IndexType num_items, + cudaStream_t m_stream) { + if (num_items <= 1) + return; //DeviceScan fails if n==1 + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + m_stream); + } + + template + __global__ void fill_vec_kernel(T *vec, T n, T val) { + for (T idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < n; + idx += blockDim.x * gridDim.x) + vec[idx] = val; + } + + template + void fill_vec(T *vec, T n, T val, cudaStream_t stream) { + dim3 grid, block; + block.x = 256; + grid.x = (n + block.x - 1) / block.x; + + fill_vec_kernel<<>>(vec, n, val); + cudaCheckError() + ; + } +} +// diff --git a/cpp/nvgraph/cpp/src/convert.cu b/cpp/nvgraph/cpp/src/convert.cu new file mode 100644 index 00000000000..bb6c34146ee --- /dev/null +++ b/cpp/nvgraph/cpp/src/convert.cu @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nvgraph_convert.hxx" +#include "nvgraph_error.hxx" + + + + namespace nvgraph{ + void csr2coo( const int *csrSortedRowPtr, + int nnz, int m, int *cooRowInd, cusparseIndexBase_t idxBase){ + CHECK_CUSPARSE( cusparseXcsr2coo( Cusparse::get_handle(), + csrSortedRowPtr, nnz, m, cooRowInd, idxBase )); + } + void coo2csr( const int *cooRowInd, + int nnz, int m, int *csrSortedRowPtr, cusparseIndexBase_t idxBase){ + CHECK_CUSPARSE( cusparseXcoo2csr( Cusparse::get_handle(), + cooRowInd, nnz, m, csrSortedRowPtr, idxBase )); + } + + + void csr2csc( int m, int n, int nnz, + const void *csrVal, const int *csrRowPtr, const int *csrColInd, + void *cscVal, int *cscRowInd, int *cscColPtr, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cudaDataType_t *dataType){ + CHECK_CUSPARSE( cusparseCsr2cscEx( Cusparse::get_handle(), + m, n, nnz, + csrVal, *dataType, csrRowPtr, csrColInd, + cscVal, *dataType, cscRowInd, cscColPtr, + copyValues, idxBase, *dataType )); + } + void csc2csr( int m, int n, int nnz, + const void *cscVal, const int *cscRowInd, const int *cscColPtr, + void *csrVal, int *csrRowPtr, int *csrColInd, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cudaDataType_t *dataType){ + CHECK_CUSPARSE( cusparseCsr2cscEx( Cusparse::get_handle(), + m, n, nnz, + cscVal, *dataType, cscColPtr, cscRowInd, + csrVal, *dataType, csrColInd, csrRowPtr, + copyValues, idxBase, *dataType )); + } + + + void csr2cscP( int m, int n, int nnz, + const int *csrRowPtr, const int *csrColInd, + int *cscRowInd, int *cscColPtr, int *p, + cusparseIndexBase_t idxBase){ + + SHARED_PREFIX::shared_ptr pBuffer; + + // Step 1: Allocate buffer + size_t pBufferSizeInBytes = 0; + csr2csc2BufferSize(m, n, nnz, csrRowPtr, csrColInd, &pBufferSizeInBytes); + pBuffer = allocateDevice(pBufferSizeInBytes, NULL); + // Step 2: Setup permutation vector P to identity + createIdentityPermutation(nnz, p); + // Step 3: Convert and get perumation array + csr2csc2(m, n, nnz, csrRowPtr, csrColInd, cscRowInd, cscColPtr, p, pBuffer.get(), idxBase); + } + + + void cooSortByDestination(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ + size_t pBufferSizeInBytes = 0; + SHARED_PREFIX::shared_ptr pBuffer; + SHARED_PREFIX::shared_ptr P; // permutation array + + // step 0: copy src to dst + if(dstRowInd!=srcRowInd) + CHECK_CUDA( cudaMemcpy(dstRowInd, srcRowInd, nnz*sizeof(int), cudaMemcpyDefault) ); + if(dstColInd!=srcColInd) + CHECK_CUDA( cudaMemcpy(dstColInd, srcColInd, nnz*sizeof(int), cudaMemcpyDefault) ); + // step 1: allocate buffer (needed for cooSortByRow) + cooSortBufferSize(m, n, nnz, dstRowInd, dstColInd, &pBufferSizeInBytes); + pBuffer = allocateDevice(pBufferSizeInBytes, NULL); + // step 2: setup permutation vector P to identity + P = allocateDevice(nnz, NULL); + createIdentityPermutation(nnz, P.get()); + // step 3: sort COO format by Row + cooGetDestinationPermutation(m, n, nnz, dstRowInd, dstColInd, P.get(), pBuffer.get()); + // step 4: gather sorted cooVals + gthrX(nnz, srcVal, dstVal, P.get(), idxBase, dataType); + } + void cooSortBySource(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ + size_t pBufferSizeInBytes = 0; + SHARED_PREFIX::shared_ptr pBuffer; + SHARED_PREFIX::shared_ptr P; // permutation array + + // step 0: 
copy src to dst + CHECK_CUDA( cudaMemcpy(dstRowInd, srcRowInd, nnz*sizeof(int), cudaMemcpyDefault) ); + CHECK_CUDA( cudaMemcpy(dstColInd, srcColInd, nnz*sizeof(int), cudaMemcpyDefault) ); + // step 1: allocate buffer (needed for cooSortByRow) + cooSortBufferSize(m, n, nnz, dstRowInd, dstColInd, &pBufferSizeInBytes); + pBuffer = allocateDevice(pBufferSizeInBytes, NULL); + // step 2: setup permutation vector P to identity + P = allocateDevice(nnz, NULL); + createIdentityPermutation(nnz, P.get()); + // step 3: sort COO format by Row + cooGetSourcePermutation(m, n, nnz, dstRowInd, dstColInd, P.get(), pBuffer.get()); + // step 4: gather sorted cooVals + gthrX(nnz, srcVal, dstVal, P.get(), idxBase, dataType); + } + + void coos2csc(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColPtr, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ + // coos -> cood -> csc + SHARED_PREFIX::shared_ptr tmp = allocateDevice(nnz, NULL); + cooSortByDestination(m, n, nnz, srcVal, srcRowInd, srcColInd, dstVal, dstRowInd, tmp.get(), idxBase, dataType); + coo2csr(tmp.get(), nnz, m, dstColPtr, idxBase); + } + void cood2csr(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowPtr, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ + // cood -> coos -> csr + SHARED_PREFIX::shared_ptr tmp = allocateDevice(nnz, NULL); + cooSortBySource(m, n, nnz, srcVal, srcRowInd, srcColInd, dstVal, tmp.get(), dstColInd, idxBase, dataType); + coo2csr(tmp.get(), nnz, m, dstRowPtr, idxBase); + } + void coou2csr(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowPtr, int *dstColInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ + cood2csr(m, n, nnz, + srcVal, srcRowInd, srcColInd, + dstVal, dstRowPtr, dstColInd, + idxBase, dataType); + } + void coou2csc(int m, int n, int nnz, + const void *srcVal, const int *srcRowInd, const int *srcColInd, + void *dstVal, int *dstRowInd, int *dstColPtr, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ + coos2csc(m, n, nnz, + srcVal, srcRowInd, srcColInd, + dstVal, dstRowInd, dstColPtr, + idxBase, dataType); + } + + ////////////////////////// Utility functions ////////////////////////// + void createIdentityPermutation(int n, int *p){ + CHECK_CUSPARSE( cusparseCreateIdentityPermutation(Cusparse::get_handle(), n, p) ); + } + + void gthrX( int nnz, const void *y, void *xVal, const int *xInd, + cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ + if(*dataType==CUDA_R_32F){ + CHECK_CUSPARSE( cusparseSgthr(Cusparse::get_handle(), nnz, (float*)y, (float*)xVal, xInd, idxBase )); + } else if(*dataType==CUDA_R_64F) { + CHECK_CUSPARSE( cusparseDgthr(Cusparse::get_handle(), nnz, (double*)y, (double*)xVal, xInd, idxBase )); + } + } + + + void cooSortBufferSize(int m, int n, int nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes) { + CHECK_CUSPARSE( cusparseXcoosort_bufferSizeExt( Cusparse::get_handle(), + m, n, nnz, + cooRows, cooCols, pBufferSizeInBytes )); + } + void cooGetSourcePermutation(int m, int n, int nnz, int *cooRows, int *cooCols, int *p, void *pBuffer) { + CHECK_CUSPARSE( cusparseXcoosortByRow( Cusparse::get_handle(), + m, n, nnz, + cooRows, cooCols, p, pBuffer )); + } + void cooGetDestinationPermutation(int m, int n, int nnz, int *cooRows, int *cooCols, int *p, void *pBuffer) { + CHECK_CUSPARSE( cusparseXcoosortByColumn( 
Cusparse::get_handle(), + m, n, nnz, + cooRows, cooCols, p, pBuffer )); + } + + void csr2csc2BufferSize(int m, int n, int nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSize){ + CHECK_CUSPARSE( cusparseXcsr2csc2_bufferSizeExt( Cusparse::get_handle(), + m, n, nnz, + csrRowPtr, csrColInd, pBufferSize )); + } + void csr2csc2(int m, int n, int nnz, + const int *csrRowPtr, const int *csrColInd, + int *cscRowInd, int *cscColPtr, int *p, void *pBuffer, + cusparseIndexBase_t idxBase){ + cusparseMatDescr_t descrA; + CHECK_CUSPARSE( cusparseCreateMatDescr(&descrA) ); + CHECK_CUSPARSE( cusparseSetMatIndexBase(descrA, idxBase) ); + CHECK_CUSPARSE( cusparseXcsr2csc2( Cusparse::get_handle(), + m, n, nnz, + descrA, + csrRowPtr, csrColInd, + cscColPtr, cscRowInd, p, + pBuffer )); + CHECK_CUSPARSE( cusparseDestroyMatDescr(descrA) ); + } + +} //end namespace nvgraph diff --git a/cpp/nvgraph/cpp/src/csr_graph.cpp b/cpp/nvgraph/cpp/src/csr_graph.cpp new file mode 100644 index 00000000000..83a8f6a0003 --- /dev/null +++ b/cpp/nvgraph/cpp/src/csr_graph.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "csr_graph.hxx" + +namespace nvgraph +{ + + template + CsrGraph& CsrGraph::operator=(const CsrGraph& graph) + { + + } + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/csrmv.cu b/cpp/nvgraph/cpp/src/csrmv.cu new file mode 100644 index 00000000000..b113c9c79e1 --- /dev/null +++ b/cpp/nvgraph/cpp/src/csrmv.cu @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /* This file contains the nvgraph generalized implementation of the Duane Merrill's CUB CSRMV using MergePath */ + +#include "nvgraph_csrmv.hxx" +#include "exclusive_kv_scan.hxx" //atomics are included in semiring +#include "semiring.hxx" + +#include "nvgraph_error.hxx" + +//IMPORTANT: IndexType_ must be a signed integer, long, long long etc. 
Unsigned int is not supported, since -1 is + //used as a flag value + + namespace nvgraph{ + + //Calculates SM to be used-add to cpp host file +__forceinline__ cudaError_t SmVersion(int &smVersion, int deviceOrdinal) +{ + cudaError_t error = cudaSuccess; //assume sucess and state otherwise if fails condition + do + { + //Find out SM version + int major, minor; + if (error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceOrdinal)) break; + if (error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceOrdinal)) break; + smVersion = 100 * major + 10 * minor; + } while(0); + return error; +} + +template< +int _BLOCK_THREADS, //number of threads per thread block +int _ITEMS_PER_THREAD> //number of items per individual thread +struct SpmvBlockThread //this is in agent file other template parameters ignoring for now +{ +//set constants + enum + { + BLOCK_THREADS = _BLOCK_THREADS, //number of threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, //number of items per thread per tile(tid) of input + }; +}; + +//This function calculates the MergePath(load-balancing) for each thread by doing a binary search +//along the diagonal +template +__device__ __forceinline__ void MergePathSearch( + IndexType_ diag, + IndexType_ *A, //rowoffsets + 1 + IndexType_ offset, //counter array + IndexType_ A_length, + IndexType_ B_length, + Coord &pathCoord) //returned by reference stores the path + { + IndexType_ splitMin = max(diag - B_length, IndexType_(0)); //must be nonnegative + IndexType_ splitMax = min(diag, A_length); //stay in bounds + //do binary search along diagonal + while (splitMin < splitMax) + { + IndexType_ splitPivot = (splitMin + splitMax) / 2; //take average integer division-start in middle so can go up or down diagonal + if (A[splitPivot] <= diag - splitPivot - 1 + offset) //i+j = diag -1 along cross diag **ignored B + //move up A and down B from (i,j) to (i-1,j+1) + { + splitMin = splitPivot + 1; //increase a in case that it is less clearly before split_min <= split_pivot less than average + } + else + { + //move down A and up B + splitMax = splitPivot; + } + } + //transform back to array coordinates from cross diagaonl coordinates + pathCoord.x = min(splitMin, A_length); //make sure do not go out of bounds; + //constraint i + j = k + pathCoord.y = diag - splitMin; + } + + //Spmv search kernel that calls merge path and identifies the merge path starting coordinates for each tile + template + __global__ void DeviceSpmvSearchKernel( //calls device function merge path + int numMergeTiles, //[input] Number of spmv merge tiles which is the spmv grid size + Coord *dTileCoords, //[output] pointer to a temporary array of tile starting coordinates + CsrMvParams spParams) //[input] spmv input parameter with corrdponding needed arrays +{ + //set the constants for the gpu architecture + enum + { + BLOCK_THREADS = SpmvBlockThread::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvBlockThread::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid <= numMergeTiles) //verify within domain + { + IndexType_ diag = tid * TILE_ITEMS; + Coord tileCoord; //each tid will compute its own tile_coordinate + //the above coordinate will be stored in tile_coordinate passed by reference + //input row pointer starting at csrRowPtr[1] merge path ignores the 0 entry + //the first argument to the counting constructor is the size-nnz and the second argument is where to start countings + + IndexType_ 
countStart = 0; //if row pointer is 1 based make sure count starts at 1 instead of 0 + MergePathSearch(diag, spParams.csrRowPtr, countStart, spParams.m, spParams.nnz, tileCoord); + //store path of thread in array of coordinates + dTileCoords[tid] = tileCoord; //stores (y,x) = (i.j) coord of thread computed* + } +} + +//Agent sturct with two main inline functions which compute the spmv +template< +typename SpmvPolicyT, // parameterized SpmvBlockThread tuning policy type as listed above +typename IndexType_, //index value of rowOffsets and ColIndices +typename ValueType_, //matrix and vector value type +typename SemiRingType_, //this follows different semiring structs to be passed depending on the enum +bool hasAlpha, //signifies whether the input parameter alpha is 1 in y = alpha*A*x + beta*A*y +bool hasBeta> //signifies whether the input parameter beta is 0 +struct AgentSpmv +{ + //set constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; +//we use the return type pair for scanning where the pairs are accumulated segment-value with segemn-index + __device__ __forceinline__ KeyValuePair consumeTile( + Coord tileStartCoord, //this gives the starting coordinate to be determined from the initial mergepath call + Coord tileEndCoord, + CsrMvParams &spParams, + SemiRingType_ SR) //pass struct as a const reference + { + + IndexType_ tileNumRows = tileEndCoord.x - tileStartCoord.x; //length(rowOffSets) = numRows + 1 in merge path ignore first element for 1 and so length of path in x-direction gives the exact number of rows + IndexType_ tileNnz = tileEndCoord.y - tileStartCoord.y; //number of nonzero goes down path countingITerator is indexed by columnInd and Val array which are of size nnz + //load row offsets into shared memory-create shared memory row offset pointer + __shared__ IndexType_ smemTileRowPtr[ITEMS_PER_THREAD + TILE_ITEMS + 1]; + //copy row offsets into shared memory for accumulating matrix vector dot products in the merge path + for (int item = threadIdx.x; item <= tileNumRows; item += BLOCK_THREADS) //index by block_threads that is the number of threads per block + //start with rowoffsets at the strat coordinate and corresponding threadId can modiy wd to do a cache wrapper for efficiency later + { + if ((tileStartCoord.x + item) < spParams.m) //memory protection since already at +1 only go up to m + { + smemTileRowPtr[item] = spParams.csrRowPtr[tileStartCoord.x + item]; + } + } + + //after loading into shared memory we must sync the threads to make sure all complete + __syncthreads(); + Coord threadStartCoord; + //call MergePath again on shared memory after using start indices + IndexType_ diag = threadIdx.x * ITEMS_PER_THREAD; //compute diagonal + //shared memory row pointer has been indexed down to 0 so count offset can start at 0 too + //counter iterator starts at current y position + IndexType_ countIndId = tileStartCoord.y; + MergePathSearch(diag, + smemTileRowPtr, //sort list A = row offsets in shared memort + countIndId, //sort list B = natural number consecutive counting indices starting index + tileNumRows, + tileNnz, + threadStartCoord); //resulting path is stored in threadStartCoord + __syncthreads(); //make sure every thread has completed their diagonal of merge path + + //Compute the thread's merge path segment to perform the dot product foing down the merge path below in the loop + Coord threadCurrentCoord = threadStartCoord; + KeyValuePair 
scanSegment[ITEMS_PER_THREAD]; //static array of type key value pairs + //initialize each dot product contribution to 0 + ValueType_ totalValue; + SR.setPlus_ident(totalValue);//initialize to semiring identity for plus operation + #pragma unroll //unroll for loop for efficiency + for (int item = 0; item < ITEMS_PER_THREAD; ++item) //loop over items belonging to thread along merge path + { + //go down merge path and sum. when move to right new component of result vector y + //countInd is consecutive nonzero natural number array going down the matrix B so + //indexed by y whereas rowOffset goes to the move and is A indexed by x + countIndId = threadCurrentCoord.y + tileStartCoord.y; //line number problem + + IndexType_ nnzId = min(countIndId, spParams.nnz - 1); //make sure stay in bounds + IndexType_ colIdx = spParams.csrColInd[nnzId]; + + ValueType_ A_val = spParams.csrVal[nnzId]; //A val + //we assume A and x are of the same datatype + //recall standard algorithm : y[row] += val[nz]*x[colInd[nnz]] in traditional sparse matrix vector form + ValueType_ x_val = spParams.x[colIdx]; //csrColInd[nnzId] + //wrapper of x vector could change dependent on the architecture + //counter will tell direction to move either right or down since last entry of rowoffsets is the totla number of nonzeros + //the counter array keeps track of this + if (countIndId < smemTileRowPtr[threadCurrentCoord.x]) //this means less than the number of nonzeros in that row + { //move down current row accumulating matrix and vector dot product + totalValue = SR.plus(SR.times(A_val, x_val), totalValue); //add binary operation because may change to minus and min rather than + and * + //store in key value pair + scanSegment[item].key = tileNumRows; + scanSegment[item].value = totalValue; + ++threadCurrentCoord.y; + } + else //move right to new row and reset + {//added in else if condition + scanSegment[item].key = threadCurrentCoord.x; + scanSegment[item].value = totalValue; //store current without adding new and set to 0 for new row + SR.setPlus_ident(totalValue);//0.0;//SR.times_null; + ++threadCurrentCoord.x; + } + } + __syncthreads(); //now each thread block has their matrix vector multiplication and we must do a blockwide reduction + //Block-wide reduce-value-by-segment + KeyValuePair scanItem, tileCarry; //this is the key value pair that we will be returning + + scanItem.key = threadCurrentCoord.x; //added min in other version had min with num rows + scanItem.value = totalValue; + + PrefixSum(SR).ExclusiveKeyValueScan(scanItem, tileCarry); + if (tileNumRows > 0) + { + if (threadIdx.x == 0) + scanItem.key = -1; //can be negative imp to be int rather than unsigned int + //do a direct scatter + #pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + if (scanSegment[item].key < tileNumRows) //scanSegment is an array of key value pairs + { + if (scanItem.key == scanSegment[item].key) + { + scanSegment[item].value = SR.plus(scanItem.value, scanSegment[item].value); + } + + if (hasAlpha){ + //boolean set to 1 need to multiply Ax by alpha as stored in spParams + scanSegment[item].value = SR.times(spParams.alpha, scanSegment[item].value); + } + + //check if has beta then need to alter y the right hand side is multiplied by beta + if (hasBeta) + { //y = alpha*A*x + beta*y + ValueType_ y_val = spParams.y[tileStartCoord.x + scanSegment[item].key]; //currentxcoord is stored in the key and this will give corresponding and desired row entry in y + scanSegment[item].value = SR.plus(SR.times(spParams.beta, y_val), 
scanSegment[item].value); + } + + //Set the output vector row element + spParams.y[tileStartCoord.x + scanSegment[item].key] = scanSegment[item].value; //disjoint keys + } + } + } + //Return the til'es running carry-out key value pair + return tileCarry; //will come from exclusive scan + } + + //overload consumetile function for the one in the interafce which will be called by the dispatch function + __device__ __forceinline__ void consumeTile ( + Coord *dTileCoords, //pointer to the temporary array of tile starting cooordinates + IndexType_ *dTileCarryKeys, //output pointer to temporary array carry-out dot product row-ids, one per block + ValueType_ *dTileCarryValues, //output pointer to temporary array carry-out dot product row-ids, one per block + int numMergeTiles, //number of merge tiles + CsrMvParams spParams, + SemiRingType_ SR) + { + int tid = (blockIdx.x * gridDim.y) + blockIdx.y; //curent tile index + //only continue if tid is in proper range + if (tid >= numMergeTiles) + return; + Coord tileStartCoord = dTileCoords[tid]; //+0 ignored + Coord tileEndCoord = dTileCoords[tid + 1]; + + //Consume multi-segment tile by calling above consumeTile overloaded function + KeyValuePair tileCarry = consumeTile( + tileStartCoord, + tileEndCoord, + spParams, + SR); + + //output the tile's carry out + if (threadIdx.x == 0) + { + if (hasAlpha) + tileCarry.value = SR.times(spParams.alpha, tileCarry.value); + + tileCarry.key += tileStartCoord.x; + + if (tileCarry.key < spParams.m) + { + dTileCarryKeys[tid] = tileCarry.key; + dTileCarryValues[tid] = tileCarry.value; + } + else + { + // Make sure to reject keys larger than the matrix size directly here. + // printf("%d %lf\n",tileCarry.key , tileCarry.value); + // this patch may be obsolete after the changes related to bug#1754610 + dTileCarryKeys[tid] = -1; + } + } + } +}; + +//this device kernel will call the above agent function-ignoring policies for now +template < + typename SpmvBlockThread, //parameterized spmvpolicy tunign policy type + typename IndexType_, //index type either 32 bit or 64 bit integer for rowoffsets of columnindices + typename ValueType_, //matrix and vector value type + typename SemiRingType_, //this follows different semiring structs to be passed depending on the enum + bool hasAlpha, //determines where alpha = 1 as above + bool hasBeta> //determines whether beta = 0 as above +__global__ void DeviceSpmvKernel( //this will call consume tile + CsrMvParams spParams, //pass constant reference to spmv parameters + const SemiRingType_ &SR, + Coord *dTileCoords, //input pointer to temporaray array of the tile starting coordinates of each (y,x) = (i,j) pair on the merge path + IndexType_ *dTileCarryKeys, //output is a pointer to the temp array that carries out the dot porduct row-ids where it is one per block + ValueType_ *dTileCarryValues, //output is a pointer to the temp array that carries out the dot porduct row-ids where it is one per block + int numTiles //input which is the number of merge tiles + ) +{ + //call Spmv agent type specialization- need to fix this call!! 
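  // Descriptive note on the merge-path decomposition driving this kernel: list A
  // holds the row end offsets (csrRowPtr + 1, length m) and list B is the implicit
  // sequence of nonzero indices 0..nnz-1. Merging the two lists gives a path of
  // m + nnz moves, where a "down" move accumulates one nonzero into the current
  // row's dot product and a "right" move finishes a row. The binary search over
  // the cross diagonals (MergePathSearch, launched from DeviceSpmvSearchKernel
  // and repeated per thread in consumeTile) gives each tile a fixed budget of
  // TILE_ITEMS moves and each thread ITEMS_PER_THREAD moves, so the work stays
  // balanced even when row lengths are highly skewed.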
+ //now call cosntructor to initialize and consumeTile to calculate the row dot products + AgentSpmv().consumeTile( + dTileCoords, + dTileCarryKeys, + dTileCarryValues, + numTiles, + spParams, + SR); +} + +//Helper functions for the reduction by kernel +//for block loading block_load_vectorize for SM_30 implemenation from cub +//Load linear segment into blocked arrangement across the thread block, guarded by range, +//with a fall-back assignment of -1 for out of bound +template +__device__ __forceinline__ void loadDirectBlocked( + int linearTid, //input:a asuitable 1d thread-identifier for calling the thread + IndexType_ *blockItrKeys, //input: thread block's base input iterator for loading from + ValueType_ *blockItrValues, //input: thread block's base input iterator for loading from + KeyValuePair (&items)[ITEMS_PER_THREAD], // output:data to load + int validItems, //input:Number of valid items to load + KeyValuePair outOfBoundsDefault) //input:Default value to assign to out of bounds items -1 in this case +{ + #pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + int offset = (linearTid * ITEMS_PER_THREAD) + item; + // changed validItems to validItems-1 for bug#1754610 since it was causing uninitialized memory accesses here + items[item].key = (offset < validItems-1) ? blockItrKeys[offset] : outOfBoundsDefault.key; + items[item].value = (offset < validItems-1) ? blockItrValues[offset] : outOfBoundsDefault.value; + } +} + +//load linear segment of items into a blocked arangement across a thread block +template +__device__ __forceinline__ void loadDirectBlocked( + int linearTid, + IndexType_ * blockItrKeys, + ValueType_ * blockItrValues, + KeyValuePair (&items)[ITEMS_PER_THREAD]) +{ + //Load directly in thread-blocked order + #pragma unroll + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + items[item].key = blockItrKeys[(linearTid *ITEMS_PER_THREAD) + item]; + items[item].value = blockItrValues[(linearTid *ITEMS_PER_THREAD) + item]; + } +} + +//This part pertains to the fixup kernel which does a device-wide reduce-value-by-key +//for the thread blocks +template< +typename SpmvPolicyT, // parameterized SpmvBlockThread tuning policy type as listed above +typename IndexType_, +typename ValueType_, +typename SemiRingType_> //matrix and vector value type +struct AgentSegmentReduction +{ + //set constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + //This function processes an input tile and uses an atomic rewrite strategy + template + __device__ __forceinline__ void consumeTilePost( + IndexType_ *dInKeys, //input array of key value pairs + ValueType_ *dInValues, //input array of key value pairs + ValueType_ *dAggregatesOut, //output value aggregates into final array y + IndexType_ numRemaining, //Number of global input items remaining including this tile + IndexType_ tileOffset, //Tile offset + SemiRingType_ SR + ) + { + KeyValuePair pairs[ITEMS_PER_THREAD]; + KeyValuePair outOfBoundsPair; + outOfBoundsPair.key = -1; //default value to assign to out of bounds items is set to be -1 + int linearTid = threadIdx.x; + //load the values into pairs + if (isLastTile) + { + loadDirectBlocked + (linearTid, + dInKeys + tileOffset, + dInValues + tileOffset, + pairs, + numRemaining, + outOfBoundsPair); + + } + else + { + loadDirectBlocked + (linearTid, + dInKeys + tileOffset, + dInValues + tileOffset, + pairs); + } + + #pragma unroll + for (int item = 1; 
item < ITEMS_PER_THREAD; ++item) + { + ValueType_ *dScatter = dAggregatesOut + pairs[item-1].key; //write to correct row using the key + if (pairs[item].key != pairs[item-1].key) + { + SR.atomicPlus(dScatter, pairs[item -1].value); + } + else + pairs[item].value = SR.plus(pairs[item -1].value, pairs[item].value); //the operation is SUm + } + // Write out last item if it is valid by checking last key boolean. + // pairs[ITEMS_PER_THREAD - 1].key = -1 for out bound elements. + ValueType_ *dScatter = dAggregatesOut + pairs[ITEMS_PER_THREAD - 1].key; + if ((!isLastTile || pairs[ITEMS_PER_THREAD - 1].key >= 0)) + { + //printf("hello %d %lf\n", pairs[ITEMS_PER_THREAD - 1].key , pairs[ITEMS_PER_THREAD -1].value); + SR.atomicPlus(dScatter, pairs[ITEMS_PER_THREAD -1].value); + } + } + //this function will call consumeTilePost and it scans the tiles of items as a part of a dynamic chained scan + __device__ __forceinline__ void consumeRange( + IndexType_ *dKeysIn, //input array of key value pairs + ValueType_ *dValuesIn, //input array of key value pairs + ValueType_ *dAggregatesOut, //output value aggregates into final array y + int numItems, //totall number of input items + int numTiles, //total number of input tiles + SemiRingType_ SR) + { + //Blocks are launched in increasing order, so we assign one tile per block + int tileIdx = (blockIdx.x * gridDim.y) + blockIdx.y; //current tile index same as in consumeTile + IndexType_ tileOffset = tileIdx * TILE_ITEMS; //Global offset for the current tile + IndexType_ numRemaining = numItems - tileOffset; //Remaining items which includes this tile + if (numRemaining > TILE_ITEMS) //this is not the last tile so call wit template argument set to be false + consumeTilePost(dKeysIn, dValuesIn, dAggregatesOut, numRemaining,tileOffset, SR); + else if (numRemaining > 0) //this is the last tile which could be possibly partially full + consumeTilePost(dKeysIn, dValuesIn, dAggregatesOut, numRemaining,tileOffset, SR); + } +}; + +//Blockwide reduction by key final kernel +template < +typename SpmvBlockThreadSegment, //parameterized spmvpolicy tuning policy type +typename IndexType_, +typename ValueType_, +typename SemiRingType_> +__global__ void DeviceSegmentReductionByKeyKernel( //this will call consume tile + IndexType_ *dKeysIn, //input pointer to the arry of dot product carried out by row-ids, one per spmv block + ValueType_ *dValuesIn, //input pointer to the arry of dot product carried out by row-ids, one per spmv block + ValueType_ *dAggregatesOut, //output value aggregates - will be y-final output of method + IndexType_ numItems, // total number of items to select + int numTiles, //total number of tiles for the entire problem + SemiRingType_ SR) +{ + //now call cosntructor to initialize and consumeTile to calculate the row dot products + AgentSegmentReduction().consumeRange( + dKeysIn, + dValuesIn, + dAggregatesOut, + numItems, + numTiles, + SR); +} + +template //matrix and vector value type + //this is setting all the grid parameters and size +struct DispatchSpmv +{ + //declare constants + enum + { + INIT_KERNEL_THREADS = 128 + }; + //sample tuning polic- can add more later + //SM30 + struct Policy350 //as a sample there are many other policies to follow + { + typedef SpmvBlockThread< (sizeof(ValueType_) > 4) ? 96 : 128, //for double use 96 threads per block otherwise 128 + (sizeof(ValueType_) > 4) ? 
4 : 4 //for double use 4 items per thread otherwise use 7 + > SpmvPolicyT;///use instead of PtxPolicy come backa nd use cusparse to determine the architetcure + }; + + struct Policy350Reduction //as a sample there are many other policies to follow + { + typedef SpmvBlockThread<128,3> SpmvPolicyT; //use instead of PtxPolicy come backa nd use cusparse to determine the architetcure + };//for <128,1> 1 item per thread need a reduction by key + + __forceinline__ static cudaError_t Dispatch(CsrMvParams spParams, const SemiRingType_ &SR, cudaStream_t stream = 0) + { + cudaError_t error = cudaSuccess; + //could move this block to initkernel fucntion + int blockThreads = Policy350::SpmvPolicyT::BLOCK_THREADS; + int itemsPerThread = Policy350::SpmvPolicyT::ITEMS_PER_THREAD; + + int blockThreadsRed = Policy350Reduction::SpmvPolicyT::BLOCK_THREADS; + int itemsPerThreadRed = Policy350Reduction::SpmvPolicyT::ITEMS_PER_THREAD; + //calculate total number of spmv work items + do { //do-while loop condition at end of loop + //Get device ordinal + int deviceOrdinal, smVersion, smCount, maxDimx; + if (error = cudaGetDevice(&deviceOrdinal)) break; + + //Get device SM version + if (error = SmVersion(smVersion, deviceOrdinal)) break; + + //Get SM count-cudaDeviceGetAttribute is built in cuda function + if (error = cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, deviceOrdinal)) break; + + //Get max dimension of the grid in the x direction + if (error = cudaDeviceGetAttribute(&maxDimx, cudaDevAttrMaxGridDimX, deviceOrdinal)) break; + + int numMergeItems = spParams.m + spParams.nnz; //total amount of work for one diagonal/thread + + //Tile sizes of relevant kernels + int mergeTileSize = blockThreads * itemsPerThread; //for floats this will be a larger number + //and since we will be dividing by it less memory allocated for the float case + int segmentRedTileSize = blockThreadsRed * itemsPerThreadRed; + + //Calculate number of tiles for the kernels + //need unsigned int to prevent underflow/overflow + unsigned int numMergeTiles = (numMergeItems + mergeTileSize - 1) / mergeTileSize; //launch thread number + unsigned int numSegmentRedTiles = (numMergeTiles + segmentRedTileSize - 1) / segmentRedTileSize; + //int spmv_sm_occupancy ignore maxSmOccupancy function for now and corresponding segmentfixup + //get grid dimensions use cuda built in dattetype dim3-has constructor with the 3 arguments + + dim3 spmvGridSize(min(numMergeTiles, (unsigned int) maxDimx), + (numMergeTiles + maxDimx - 1) / maxDimx, //make sure at least 1 + 1); //2D grid + //grid for second kernel + dim3 segmentRedGridSize(min(numSegmentRedTiles, (unsigned int) maxDimx), + (numSegmentRedTiles + maxDimx -1) / maxDimx, + 1); + Vector > dTileCoords(numMergeTiles + 1, stream); + Vector dTileCarryKeys(numMergeTiles, stream); + Vector dTileCarryValues(numMergeTiles, stream); + + //Get search grid dimensions + int searchBlockSize = INIT_KERNEL_THREADS; + int searchGridSize = (numMergeTiles + searchBlockSize) / searchBlockSize; //ignored the +1 -1 + //call Search Kernel within the host so need <<>>> + //call devicesearch kernel to compute starting coordiantes of merge path + DeviceSpmvSearchKernel + <<>>( + numMergeTiles, + dTileCoords.raw(), + spParams); + cudaCheckError(); + //this will give the starting coordaintes to be called in DeviceSPmvKernel + + DeviceSpmvKernel + <<>>( + spParams, + SR, + dTileCoords.raw(), + dTileCarryKeys.raw(), + dTileCarryValues.raw(), + numMergeTiles); + cudaCheckError(); + //Run reduce by key kernel if necessary 
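      // Descriptive note: a merge tile generally ends in the middle of a row, so
      // consumeTile returns one carry-out pair (row id, partial sum) per tile
      // through dTileCarryKeys / dTileCarryValues. With a single tile there is
      // nothing left to combine; with several tiles those carry-outs still have
      // to be reduced by row id and added into y, which is what the
      // reduce-by-key launch below does.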
+ //if (error = cudaPeekAtLastError()) break; //check for failure to launch + if (numMergeTiles > 1) + { + DeviceSegmentReductionByKeyKernel + <<>> + (dTileCarryKeys.raw(), + dTileCarryValues.raw(), + spParams.y, + numMergeTiles, + numSegmentRedTiles, + SR); + cudaCheckError(); + //if (error = cudaPeekAtLastError()) break; //check for failure to launch of fixup kernel + } + } while(0); //make sure executes exactly once to give chance to break earlier with errors + cudaCheckError(); + + return error; + } +}; + +template +cudaError_t callDispatchSpmv(CsrMvParams &spParams, const SemiRingType_ &SR, cudaStream_t stream = 0) +{ + cudaError_t error; + //determine semiring type + if (spParams.beta == SR.times_null) + { + if (spParams.alpha == SR.times_ident) //simply y = A*x + error = DispatchSpmv::Dispatch(spParams, SR, stream); //must be on the device + + else + error = DispatchSpmv::Dispatch(spParams, SR, stream); //must be passed by reference to some since writing + } + else + { + if (spParams.alpha == SR.times_ident) + error = DispatchSpmv::Dispatch(spParams, SR, stream); + else + error = DispatchSpmv::Dispatch(spParams, SR, stream); + } + return error; +} + +template +cudaError_t callSemiringSpmv(CsrMvParams &spParams, Semiring SR, cudaStream_t stream = 0) +{ + // This is dangerous but we need to initialize this value, probably it's + // better to return success than to return some misleading error code + cudaError_t error = cudaSuccess; + switch(SR) + { + case PlusTimes: + { + PlusTimesSemiring plustimes; //can be float or double for real case + error = callDispatchSpmv(spParams, plustimes, stream); + } + break; + case MinPlus: + { + MinPlusSemiring minplus; + error = callDispatchSpmv(spParams, minplus, stream); + } + break; + case MaxMin: + { + MaxMinSemiring maxmin; + error = callDispatchSpmv(spParams, maxmin, stream); + } + break; + case OrAndBool: + { + OrAndBoolSemiring orandbool; + error = callDispatchSpmv(spParams, orandbool, stream); + } + break; + case LogPlus: + { + LogPlusSemiring logplus; + error = callDispatchSpmv(spParams, logplus, stream); + } + break; + } + return error; +} + +//create a device function interface to call the above dispatch function +template +cudaError_t csrmv_mp( + IndexType_ n, + IndexType_ m, + IndexType_ nnz, + ValueType_ alpha, + ValueType_ * dValues, //all must be preallocated on the device + IndexType_ * dRowOffsets, + IndexType_ * dColIndices, + ValueType_ *dVectorX, + ValueType_ beta, + ValueType_ *dVectorY, + Semiring SR, + cudaStream_t stream) +{ //create user interface + //calling device kernel depends on tempalte boolean parameters fro alpha/beta + //Set parameters for struct + CsrMvParams spParams; + spParams.m = m; + spParams.n = n; + spParams.nnz = nnz; + spParams.alpha = alpha; + spParams.beta = beta; + spParams.csrRowPtr = dRowOffsets + 1; //ignore first 0 component in merge path specific for this spmv only + spParams.csrVal = dValues; + spParams.csrColInd = dColIndices; + spParams.x = dVectorX; + spParams.y = dVectorY; + + return callSemiringSpmv(spParams, SR, stream); +} + + +template +cudaError_t csrmv_mp( + IndexType_ n, + IndexType_ m, + IndexType_ nnz, + ValueType_ alpha, + ValuedCsrGraph network, + ValueType_ *dVectorX, + ValueType_ beta, + ValueType_ *dVectorY, + Semiring SR, + cudaStream_t stream + ) +{ + //calling device kernel depends on tempalte boolean parameters fro alpha/beta + //Set parameters for struct + + CsrMvParams spParams; + spParams.m = m; + spParams.n = n; + spParams.nnz = nnz; + spParams.alpha = alpha; + 
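  // Note: the merge-path kernels work off the row end offsets, which is why
  // csrRowPtr is set to dRowOffsets + 1 a few lines below; the leading zero of
  // the CSR row-offset array is intentionally skipped for this SpMV path.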
spParams.beta = beta; + spParams.csrRowPtr = network.get_raw_row_offsets() + 1; //ignore first 0 component in merge path specific for this spmv only + spParams.csrVal = network.get_raw_values(); + spParams.csrColInd = network.get_raw_column_indices(); + spParams.x = dVectorX; + spParams.y = dVectorY; + + return callSemiringSpmv(spParams, SR, stream); +} + +//declare template types to be called +template cudaError_t csrmv_mp( + int n, + int m, + int nnz, + double alpha, + double * dValues, //all must be preallocated on the device + int * dRowOffsets, + int * dColIndices, + double *dVectorX, + double beta, + double *dVectorY, + Semiring SR, + cudaStream_t stream + ); + +template cudaError_t csrmv_mp( + long long n, + long long m, + long long nnz, + double alpha, + double * dValues, //all must be preallocated on the device + long long * dRowOffsets, + long long * dColIndices, + double *dVectorX, + double beta, + double *dVectorY, + Semiring SR, + cudaStream_t stream + ); + +template cudaError_t csrmv_mp( + int n, + int m, + int nnz, + float alpha, + float * dValues, //all must be preallocated on the device + int * dRowOffsets, + int * dColIndices, + float *dVectorX, + float beta, + float *dVectorY, + Semiring SR, + cudaStream_t stream + ); +//for 64 bit support which may not be needed +template cudaError_t csrmv_mp( + long long n, + long long m, + long long nnz, + float alpha, + float * dValues, //all must be preallocated on the device + long long * dRowOffsets, + long long * dColIndices, + float *dVectorX, + float beta, + float *dVectorY, + Semiring SR, + cudaStream_t stream + ); +//assume embedding booleans in the reals +/*template cudaError_t csrmv_mp( + int n, + int m, + int nnz, + bool alpha, + bool * dValues, //all must be preallocated on the device + int * dRowOffsets, + int * dColIndices, + bool *dVectorX, + bool beta, + bool *dVectorY, + Semiring SR + ); +//for 64 bit support which may not be needed +template cudaError_t csrmv_mp( + long long n, + long long m, + long long nnz, + bool alpha, + bool * dValues, //all must be preallocated on the device + long long * dRowOffsets, + long long * dColIndices, + bool *dVectorX, + bool beta, + bool *dVectorY, + Semiring SR + );*/ + +//declare template types to be called using valued_csr_graph version +template cudaError_t csrmv_mp( + int n, + int m, + int nnz, + double alpha, + ValuedCsrGraph network, + double *dVectorX, + double beta, + double *dVectorY, + Semiring SR, + cudaStream_t stream + ); + +template cudaError_t csrmv_mp( + long long n, + long long m, + long long nnz, + double alpha, + ValuedCsrGraph network, + double *dVectorX, + double beta, + double *dVectorY, + Semiring SR, + cudaStream_t stream + ); + +template cudaError_t csrmv_mp( + int n, + int m, + int nnz, + float alpha, + ValuedCsrGraph network, + float *dVectorX, + float beta, + float *dVectorY, + Semiring SR, + cudaStream_t stream + ); +//for 64 bit support which may not be needed +template cudaError_t csrmv_mp( + long long n, + long long m, + long long nnz, + float alpha, + ValuedCsrGraph network, + float *dVectorX, + float beta, + float *dVectorY, + Semiring SR, + cudaStream_t stream + ); + +/*template cudaError_t csrmv_mp( + int n, + int m, + int nnz, + bool alpha, + ValuedCsrGraph network, + bool *dVectorX, + bool beta, + bool *dVectorY, + Semiring SR + ); +//for 64 bit support which may not be needed +template cudaError_t csrmv_mp( + long long n, + long long m, + long long nnz, + bool alpha, + ValuedCsrGraph network, + bool *dVectorX, + bool beta, + bool *dVectorY, + 
Semiring SR + );*/ + +} //end namespace nvgraph + +using namespace nvgraph; + +//this is the standard kernel used to test the semiring operations +template + __global__ void csrmv(IndexType_ num_rows, IndexType_ *dRowOffsets, IndexType_ *dColIndices, ValueType_ *dValues, + ValueType_ *dVectorX, ValueType_ *dVectorY, SemiRingType_ SR, ValueType_ alpha, ValueType_ beta) +{ + int row = blockDim.x * blockIdx.x + threadIdx.x ; + if (row < num_rows) + { + ValueType_ dot; + SR.setPlus_ident(dot); + //SR.setPlus_ident(dVectorY[row]); //need to initialize y outside + IndexType_ row_start = dRowOffsets[row]; + IndexType_ row_end = dRowOffsets[row + 1]; + for (int i = row_start; i < row_end; i++) + { + dot = SR.plus(SR.times(alpha,SR.times(dValues[i], dVectorX[dColIndices[i]])), dot); + } + dVectorY[row] = SR.plus(dot, (SR.times(beta, dVectorY[row]))); + } +} + +template +void callTestCsrmv(IndexType_ num_rows, IndexType_ *dRowOffsets, IndexType_ *dColIndices, ValueType_ *dValues, + ValueType_ *dVectorX, ValueType_ *dVectorY, nvgraph::Semiring SR, ValueType_ alpha, ValueType_ beta) +{ + const int side = 2048; + const int numThreads = 256; + const int numBlocks = (side * side + numThreads - 1) / numThreads; + switch(SR) + { + case nvgraph::PlusTimes: + { + nvgraph::PlusTimesSemiring plustimes; //can be float or double for real case + csrmv<<>>(num_rows, dRowOffsets, dColIndices, dValues, dVectorX, dVectorY, plustimes, alpha, beta); + } + break; + case nvgraph::MinPlus: + { + nvgraph::MinPlusSemiring minplus; + csrmv<<>>(num_rows, dRowOffsets, dColIndices, dValues, dVectorX, dVectorY, minplus, alpha, beta); + } + break; + case nvgraph::MaxMin: + { + nvgraph::MaxMinSemiring maxmin; + csrmv<<>>(num_rows, dRowOffsets, dColIndices, dValues, dVectorX, dVectorY, maxmin, alpha, beta); + } + break; + case nvgraph::OrAndBool: + { + nvgraph::OrAndBoolSemiring orandbool; + csrmv<<>>(num_rows, dRowOffsets, dColIndices, dValues, dVectorX, dVectorY, orandbool, alpha, beta); + } + break; + case nvgraph::LogPlus: + { + nvgraph::LogPlusSemiring logplus; + csrmv<<>>(num_rows, dRowOffsets, dColIndices, dValues, dVectorX, dVectorY, logplus, alpha, beta); + } + break; + } + cudaCheckError(); + +} + +template void callTestCsrmv(int num_rows, int *dRowOffsets, int*dColIndices, float *dValues, + float *dVectorX, float *dVectorY, nvgraph::Semiring SR, float alpha, float beta); + +template void callTestCsrmv(int num_rows, int *dRowOffsets, int*dColIndices, double *dValues, + double *dVectorX, double *dVectorY, nvgraph::Semiring SR, double alpha, double beta); + diff --git a/cpp/nvgraph/cpp/src/csrmv_cub.cu b/cpp/nvgraph/cpp/src/csrmv_cub.cu new file mode 100644 index 00000000000..a008356fcf7 --- /dev/null +++ b/cpp/nvgraph/cpp/src/csrmv_cub.cu @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nvgraph.h" +#include "nvgraphP.h" +#include "nvgraph_error.hxx" +#include "csrmv_cub.h" + +#include "cub_semiring/cub.cuh" + +namespace nvgraph +{ + +template template +NVGRAPH_ERROR SemiringDispatch::Dispatch( + const V* d_values, + const I* d_row_offsets, + const I* d_column_indices, + const V* d_vector_x, + V* d_vector_y, + V alpha, + V beta, + I num_rows, + I num_cols, + I num_nonzeros, + cudaStream_t stream) +{ + // std::static_assert(std::is_same::type, int>::value, "current CUB implementation supports int only for indices"); + size_t temp_buf_size = 0; + cudaError_t err = cub_semiring::cub::DeviceSpmv::CsrMV( NULL, temp_buf_size, d_values, d_row_offsets, d_column_indices, d_vector_x, + d_vector_y, alpha, beta, num_rows, num_cols, num_nonzeros, stream); + CHECK_CUDA(err); + Vector tmp_buf(std::max(temp_buf_size, size_t(1)), stream); + err = cub_semiring::cub::DeviceSpmv::CsrMV( tmp_buf.raw(), temp_buf_size, d_values, d_row_offsets, d_column_indices, d_vector_x, + d_vector_y, alpha, beta, num_rows, num_cols, num_nonzeros, stream); + CHECK_CUDA(err); + return NVGRAPH_OK; +}; + +// deconstructs graph, checks parameters and dispatches semiring implementation +template +NVGRAPH_ERROR SemiringDispatch::InitAndLaunch( + const nvgraph::MultiValuedCsrGraph &graph, + const size_t weight_index, + const void *p_alpha, + const size_t x_index, + const void *p_beta, + const size_t y_index, + const nvgraphSemiring_t SR, + cudaStream_t stream + ) +{ + if (weight_index >= graph.get_num_edge_dim() || x_index >= graph.get_num_vertex_dim() || y_index >= graph.get_num_vertex_dim()) // base index is 0 + return NVGRAPH_ERR_BAD_PARAMETERS; + I n = static_cast(graph.get_num_vertices()); + I nnz = static_cast(graph.get_num_edges()); + const V* vals = graph.get_raw_edge_dim(weight_index); + const V* x = graph.get_raw_vertex_dim( x_index); + V* y = const_cast(graph.get_raw_vertex_dim(y_index)); + V alpha = *(static_cast(p_alpha)); + V beta = *(static_cast(p_beta)); + const I* row_ptr = graph.get_raw_row_offsets(); + const I* col_ind = graph.get_raw_column_indices(); + + NVGRAPH_ERROR err = NVGRAPH_ERR_BAD_PARAMETERS; + + switch (SR) + { + case NVGRAPH_PLUS_TIMES_SR: + err = Dispatch< cub_semiring::cub::PlusTimesSemiring >(vals, row_ptr, col_ind, x, y, alpha, beta, n, n, nnz, stream); + break; + case NVGRAPH_MIN_PLUS_SR: + err = Dispatch< cub_semiring::cub::MinPlusSemiring >(vals, row_ptr, col_ind, x, y, alpha, beta, n, n, nnz, stream); + break; + case NVGRAPH_MAX_MIN_SR: + err = Dispatch< cub_semiring::cub::MaxMinSemiring >(vals, row_ptr, col_ind, x, y, alpha, beta, n, n, nnz, stream); + break; + case NVGRAPH_OR_AND_SR: + err = Dispatch< cub_semiring::cub::OrAndBoolSemiring >(vals, row_ptr, col_ind, x, y, alpha, beta, n, n, nnz, stream); + break; + default: + break; + } + return err; +}; + +// API wrapper to avoid bloating main API object nvgraph.cpp +NVGRAPH_ERROR SemiringAPILauncher(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t sr) +{ + typedef int I; + + if (descrG->graphStatus!=HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_ERR_BAD_PARAMETERS; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_ERR_BAD_PARAMETERS; + + cudaStream_t stream = handle->stream; + + NVGRAPH_ERROR err = NVGRAPH_ERR_NOT_IMPLEMENTED; + + switch(descrG->T) + { + case CUDA_R_32F : + { + const nvgraph::MultiValuedCsrGraph *mcsrg = 
static_cast*> (descrG->graph_handle); + err = SemiringDispatch::InitAndLaunch( *mcsrg, weight_index, static_cast(alpha), x, + static_cast(beta), y, sr, stream); + break; + } + case CUDA_R_64F : + { + const nvgraph::MultiValuedCsrGraph *mcsrg = static_cast*> (descrG->graph_handle); + err = SemiringDispatch::InitAndLaunch( *mcsrg, weight_index, static_cast(alpha), x, + static_cast(beta), y, sr, stream); + break; + } + default: + break; + } + return err; +}; + +} //namespace nvgraph diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_max.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_max.cu new file mode 100644 index 00000000000..ef8eae1678d --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_max.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + CsrGraph* contract_graph_csr_max(CsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce)); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_min.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_min.cu new file mode 100644 index 00000000000..38f4cbbd19f --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_min.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + CsrGraph* contract_graph_csr_min(CsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce)); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_mul.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_mul.cu new file mode 100644 index 00000000000..77b8534a757 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_mul.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + CsrGraph* contract_graph_csr_mul(CsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce)); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_sum.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_sum.cu new file mode 100644 index 00000000000..23d203e0801 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_csr_sum.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + CsrGraph* contract_graph_csr_sum(CsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce)); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_max.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_max.cu new file mode 100644 index 00000000000..da1334f75d1 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_max.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_double_max(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_min.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_min.cu new file mode 100644 index 00000000000..5f59f75d661 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_min.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_double_min(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_mul.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_mul.cu new file mode 100644 index 00000000000..3aa99ed0d32 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_mul.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_double_mul(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_sum.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_sum.cu new file mode 100644 index 00000000000..661f7ac4a32 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_double_sum.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
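The contraction_csr_* and contraction_mv_* translation units in this directory only instantiate contract_from_aggregates_t for each value type and combine/reduce functor (sum, mul, min, max). Conceptually, contracting a graph by an aggregate vector maps every vertex to its cluster id and merges the resulting parallel edges with the chosen reduction. The following host-side sketch (hypothetical names, not the nvgraph kernels) shows that idea with a pluggable reduce operator.

```cpp
#include <functional>
#include <map>
#include <utility>
#include <vector>

struct Csr {
  int n;                     // number of vertices
  std::vector<int> row_ptr;  // size n+1
  std::vector<int> col_ind;  // size nnz
  std::vector<double> val;   // size nnz
};

// Contract g by aggregates: agg[v] is the cluster id of vertex v
// (0 <= agg[v] < n_clusters). Parallel edges are merged with 'reduce',
// e.g. std::plus<double>() for "sum".
Csr contract_by_aggregates(const Csr& g, const std::vector<int>& agg, int n_clusters,
                           const std::function<double(double, double)>& reduce) {
  std::map<std::pair<int, int>, double> edges;  // (cluster_u, cluster_v) -> weight
  for (int u = 0; u < g.n; ++u)
    for (int e = g.row_ptr[u]; e < g.row_ptr[u + 1]; ++e) {
      std::pair<int, int> key(agg[u], agg[g.col_ind[e]]);
      auto it = edges.find(key);
      if (it == edges.end()) edges[key] = g.val[e];
      else it->second = reduce(it->second, g.val[e]);
    }
  // Re-emit the merged edges as CSR (the map already iterates in row-major order).
  Csr out;
  out.n = n_clusters;
  out.row_ptr.assign(n_clusters + 1, 0);
  for (auto& kv : edges) ++out.row_ptr[kv.first.first + 1];
  for (int i = 0; i < n_clusters; ++i) out.row_ptr[i + 1] += out.row_ptr[i];
  for (auto& kv : edges) {
    out.col_ind.push_back(kv.first.second);
    out.val.push_back(kv.second);
  }
  return out;
}
```

Intra-cluster edges become self-loops of the contracted vertex, which is the usual convention in graph coarsening.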
+ */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_double_sum(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_max.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_max.cu new file mode 100644 index 00000000000..65367b42a34 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_max.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_float_max(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_min.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_min.cu new file mode 100644 index 00000000000..d8930470410 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_min.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_float_min(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_mul.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_mul.cu new file mode 100644 index 00000000000..d3686827e77 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_mul.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_float_mul(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_sum.cu b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_sum.cu new file mode 100644 index 00000000000..e7ffdc9c05f --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_contraction/contraction_mv_float_sum.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace nvgraph +{ + //------------------------- Graph Contraction: ---------------------- + // + MultiValuedCsrGraph* contract_graph_mv_float_sum(MultiValuedCsrGraph& graph, + int* pV, size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce) + { + return static_cast*>(contract_from_aggregates_t::FctrType >(graph, pV, n, stream, + static_cast(VCombine), + static_cast(VReduce), + static_cast(ECombine), + static_cast(EReduce))); + } + +} diff --git a/cpp/nvgraph/cpp/src/graph_extractor.cu b/cpp/nvgraph/cpp/src/graph_extractor.cu new file mode 100644 index 00000000000..b8000f819a0 --- /dev/null +++ b/cpp/nvgraph/cpp/src/graph_extractor.cu @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + + + +namespace nvgraph +{ + //------------------------- SubGraph Extraction: ---------------------- + // + CsrGraph* extract_subgraph_by_vertices(CsrGraph& graph, + int* pV, size_t n, cudaStream_t stream) + { + return extract_from_vertex_subset(graph, pV, n, stream); + } + + MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, size_t n, cudaStream_t stream) + { + return static_cast*>(extract_from_vertex_subset(graph, pV, n, stream)); + } + + MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, size_t n, cudaStream_t stream) + { + return static_cast*>(extract_from_vertex_subset(graph, pV, n, stream)); + } + + CsrGraph* extract_subgraph_by_edges(CsrGraph& graph, + int* pV, size_t n, cudaStream_t stream) + { + return extract_from_edge_subset(graph, pV, n, stream); + } + + MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, size_t n, cudaStream_t stream) + { + return static_cast*>(extract_from_edge_subset(graph, pV, n, stream)); + } + + MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, size_t n, cudaStream_t stream) + { + return static_cast*>(extract_from_edge_subset(graph, pV, n, stream)); + } + + + + + + +}// end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/jaccard_gpu.cu b/cpp/nvgraph/cpp/src/jaccard_gpu.cu new file mode 100644 index 00000000000..77182eb7349 --- /dev/null +++ b/cpp/nvgraph/cpp/src/jaccard_gpu.cu @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
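graph_extractor.cu above wraps extract_from_vertex_subset / extract_from_edge_subset for the plain and multi-valued CSR graph types. The vertex-subset case amounts to keeping only edges whose endpoints both lie in the selected set and renumbering the kept vertices; a minimal host-side sketch of that semantics (illustrative only, names are mine):

```cpp
#include <vector>

// Induced subgraph on the vertex subset 'verts' of a CSR graph with n vertices.
// Kept vertices are renumbered 0..verts.size()-1 in the order given.
void extract_by_vertices(int n, const std::vector<int>& row_ptr,
                         const std::vector<int>& col_ind,
                         const std::vector<int>& verts,
                         std::vector<int>& sub_row_ptr,
                         std::vector<int>& sub_col_ind) {
  std::vector<int> new_id(n, -1);  // -1 = vertex not selected
  for (size_t i = 0; i < verts.size(); ++i) new_id[verts[i]] = static_cast<int>(i);

  sub_row_ptr.assign(verts.size() + 1, 0);
  sub_col_ind.clear();
  for (size_t i = 0; i < verts.size(); ++i) {
    int u = verts[i];
    for (int e = row_ptr[u]; e < row_ptr[u + 1]; ++e)
      if (new_id[col_ind[e]] >= 0)  // keep only edges internal to the subset
        sub_col_ind.push_back(new_id[col_ind[e]]);
    sub_row_ptr[i + 1] = static_cast<int>(sub_col_ind.size());
  }
}
```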
+ */ +// Jaccard symilarity edge weights +// Author: Alexandre Fender afender@nvidia.com and Maxim Naumov. + +#include "graph_utils.cuh" +#include "jaccard_gpu.cuh" + +namespace nvlouvain +{ + +//#define CUDA_MAX_BLOCKS 65535 +//#define CUDA_MAX_KERNEL_THREADS 256 //kernel will launch at most 256 threads per block +//#define DEFAULT_MASK 0xffffffff + +// Volume of neighboors (*weight_s) +template +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +jaccard_row_sum(int n, int e, int *csrPtr, int *csrInd, T *v, T *work) { + int row,start,end,length; + T sum; + + for (row=threadIdx.y+blockIdx.y*blockDim.y; row +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +jaccard_is(int n, int e, int *csrPtr, int *csrInd, T *v, T *work, T *weight_i, T *weight_s) { + int i,j,row,col,Ni,Nj; + int ref,cur,ref_col,cur_col,match; + T ref_val; + + for (row=threadIdx.z+blockIdx.z*blockDim.z; row>1; + cur_col= csrInd[middle]; + if (cur_col > ref_col) { + right=middle-1; + } + else if (cur_col < ref_col) { + left=middle+1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1){ + atomicAdd(&weight_i[j],ref_val); + } + } + } + } +} + +//Jaccard weights (*weight) +template +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +jaccard_jw(int n, int e, int *csrPtr, int *csrInd, T *csrVal, T *v, T gamma, T *weight_i, T *weight_s, T *weight_j) { + int j; + T Wi,Ws,Wu; + + for (j=threadIdx.x+blockIdx.x*blockDim.x; j +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +jaccard_jw(int n, int e, int *csrPtr, int *csrInd, T *v, T *weight_i, T *weight_s, T *weight_j) { + int j; + T Wi,Ws,Wu; + + for (j=threadIdx.x+blockIdx.x*blockDim.x; j +int jaccard(int n, int e, int *csrPtr, int *csrInd, T * csrVal, T *v, T *work, T gamma, T *weight_i, T *weight_s, T *weight_j) { + dim3 nthreads, nblocks; + int y=4; + + //setup launch configuration + nthreads.x = 32/y; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1)/nthreads.y,CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + jaccard_row_sum<<>>(n,e,csrPtr,csrInd,v,work); + fill(e,weight_i,(T)0.0); + //setup launch configuration + nthreads.x = 32/y; + nthreads.y = y; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1)/nthreads.z,CUDA_MAX_BLOCKS); //1; + //launch kernel + jaccard_is<<>>(n,e,csrPtr,csrInd,v,work,weight_i,weight_s); + + //setup launch configuration + nthreads.x = min(e,CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((e + nthreads.x - 1)/nthreads.x,CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + if (csrVal != NULL) + jaccard_jw<<>>(n,e,csrPtr,csrInd,csrVal,v,gamma,weight_i,weight_s,weight_j); + else + jaccard_jw<<>>(n,e,csrPtr,csrInd,v,weight_i,weight_s,weight_j); + + return 0; +} + +//template int jaccard ( int n, int e, int *csrPtr, int *csrInd, half *csrVal, half *v, half *work, half gamma, half *weight_i, half *weight_s, half *weight_j); +//template int jaccard ( int n, int e, int *csrPtr, int *csrInd, half *csrVal, half *v, half *work, half gamma, half *weight_i, half *weight_s, half *weight_j); + +template int jaccard ( int n, int e, int *csrPtr, int *csrInd, float *csrVal, float *v, float *work, float gamma, float *weight_i, float *weight_s, float *weight_j); +template int jaccard ( int n, int e, int *csrPtr, int *csrInd, float *csrVal, float *v, float *work, float gamma, float *weight_i, float 
*weight_s, float *weight_j); + +template int jaccard (int n, int e, int *csrPtr, int *csrInd, double *csrVal, double *v, double *work, double gamma, double *weight_i, double *weight_s, double *weight_j); +template int jaccard (int n, int e, int *csrPtr, int *csrInd, double *csrVal, double *v, double *work, double gamma, double *weight_i, double *weight_s, double *weight_j); + +} //namespace nvga diff --git a/cpp/nvgraph/cpp/src/kmeans.cu b/cpp/nvgraph/cpp/src/kmeans.cu new file mode 100644 index 00000000000..8cde394630a --- /dev/null +++ b/cpp/nvgraph/cpp/src/kmeans.cu @@ -0,0 +1,951 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#ifdef NVGRAPH_PARTITION +//#ifdef DEBUG + +#include "kmeans.hxx" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvgraph_vector.hxx" +#include "nvgraph_cublas.hxx" +#include "atomics.hxx" +#include "sm_utils.h" +#include "debug_macros.h" + +using namespace nvgraph; + +// ========================================================= +// Useful macros +// ========================================================= + +#define BLOCK_SIZE 1024 +#define WARP_SIZE 32 +#define BSIZE_DIV_WSIZE (BLOCK_SIZE/WARP_SIZE) + +// Get index of matrix entry +#define IDX(i,j,lda) ((i)+(j)*(lda)) + +namespace { + + // ========================================================= + // CUDA kernels + // ========================================================= + + /// Compute distances between observation vectors and centroids + /** Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, d*n entries) Observation matrix. Matrix is + * stored column-major and each column is an observation + * vector. Matrix dimensions are d x n. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Output, n*k entries) Distance matrix. Matrix is + * stored column-major and the (i,j)-entry is the square of the + * Euclidean distance between the ith observation vector and jth + * centroid. Matrix dimensions are n x k. Entries must be + * initialized to zero. 
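Before continuing into kmeans.cu: the Jaccard kernels above (jaccard_row_sum, jaccard_is, jaccard_jw) compute, for every edge (u,v), the neighbourhood volumes, their intersection (weight_i) and volume sum (weight_s), and then the ratio Wi/(Ws - Wi), i.e. intersection over union. As a plain host-side reference for the unweighted coefficient, assuming sorted adjacency lists (as the binary search in jaccard_is also assumes); names here are mine, not the kernel API:

```cpp
#include <algorithm>
#include <vector>

// Jaccard coefficient of each edge (u, v) of an unweighted CSR graph:
//   J(u, v) = |N(u) ∩ N(v)| / |N(u) ∪ N(v)|,
//   |N(u) ∪ N(v)| = |N(u)| + |N(v)| - |N(u) ∩ N(v)|.
std::vector<double> jaccard_weights(int n, const std::vector<int>& row_ptr,
                                    const std::vector<int>& col_ind) {
  std::vector<double> w(col_ind.size(), 0.0);
  for (int u = 0; u < n; ++u) {
    for (int e = row_ptr[u]; e < row_ptr[u + 1]; ++e) {
      int v = col_ind[e];
      int inter = 0;
      // Count common neighbours by binary-searching u's neighbours in v's list.
      for (int eu = row_ptr[u]; eu < row_ptr[u + 1]; ++eu)
        if (std::binary_search(col_ind.begin() + row_ptr[v],
                               col_ind.begin() + row_ptr[v + 1], col_ind[eu]))
          ++inter;
      int uni = (row_ptr[u + 1] - row_ptr[u]) + (row_ptr[v + 1] - row_ptr[v]) - inter;
      w[e] = uni > 0 ? static_cast<double>(inter) / uni : 0.0;
    }
  }
  return w;
}
```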
+ */ + template + static __global__ + void computeDistances(IndexType_ n, IndexType_ d, IndexType_ k, + const ValueType_ * __restrict__ obs, + const ValueType_ * __restrict__ centroids, + ValueType_ * __restrict__ dists) { + + // Loop index + IndexType_ i; + + // Block indices + IndexType_ bidx; + // Global indices + IndexType_ gidx, gidy, gidz; + + // Private memory + ValueType_ centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while(bidx*blockDim.x < d) { + gidx = threadIdx.x + bidx*blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y*blockDim.y; + while(gidy < k) { + + // Load centroid coordinate from global memory + centroid_private + = (gidx < d) ? centroids[IDX(gidx,gidy,d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z*blockDim.z; + while(gidz < n) { + + // Load observation vector coordinate from global memory + dist_private + = (gidx < d) ? obs[IDX(gidx,gidz,d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private*dist_private; + + // Perform reduction on warp + for(i=WARP_SIZE/2; i>0; i/=2) + dist_private += utils::shfl_down(dist_private, i, 2*i); + + // Write result to global memory + if(threadIdx.x == 0) + atomicFPAdd(dists+IDX(gidz,gidy,n), dist_private); + + // Move to another observation vector + gidz += blockDim.z*gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y*gridDim.y; + } + + // Move to another vector entry + bidx += gridDim.x; + } + + } + + /// Find closest centroid to observation vectors + /** Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Input/output, n*k entries) Distance matrix. Matrix + * is stored column-major and the (i,j)-entry is the square of + * the Euclidean distance between the ith observation vector and + * jth centroid. Matrix dimensions are n x k. On exit, the first + * n entries give the square of the Euclidean distance between + * observation vectors and closest centroids. + * @param codes (Output, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. 
+ */ + template + static __global__ + void minDistances(IndexType_ n, IndexType_ k, + ValueType_ * __restrict__ dists, + IndexType_ * __restrict__ codes, + IndexType_ * __restrict__ clusterSizes) { + + // Loop index + IndexType_ i, j; + + // Current matrix entry + ValueType_ dist_curr; + + // Smallest entry in row + ValueType_ dist_min; + IndexType_ code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x*blockDim.x; + while(i + static __global__ + void minDistances2(IndexType_ n, + ValueType_ * __restrict__ dists_old, + const ValueType_ * __restrict__ dists_new, + IndexType_ * __restrict__ codes_old, + IndexType_ code_new) { + + // Loop index + IndexType_ i; + + // Distances + ValueType_ dist_old_private; + ValueType_ dist_new_private; + + // Each row is processed by a thread + i = threadIdx.x + blockIdx.x*blockDim.x; + while(i static __global__ + void computeClusterSizes(IndexType_ n, IndexType_ k, + const IndexType_ * __restrict__ codes, + IndexType_ * __restrict__ clusterSizes) { + IndexType_ i = threadIdx.x + blockIdx.x*blockDim.x; + while(i + static __global__ + void divideCentroids(IndexType_ d, IndexType_ k, + const IndexType_ * __restrict__ clusterSizes, + ValueType_ * __restrict__ centroids) { + + + // Global indices + IndexType_ gidx, gidy; + + // Current cluster size + IndexType_ clusterSize_private; + + // Observation vector is determined by global y-index + gidy = threadIdx.y + blockIdx.y*blockDim.y; + while(gidy < k) { + + // Get cluster size from global memory + clusterSize_private = clusterSizes[gidy]; + + // Add vector entries to centroid matrix + // Vector entris are determined by global x-index + gidx = threadIdx.x + blockIdx.x*blockDim.x; + while(gidx < d) { + centroids[IDX(gidx,gidy,d)] /= clusterSize_private; + gidx += blockDim.x*gridDim.x; + } + + // Move to another centroid + gidy += blockDim.y*gridDim.y; + } + + } + + // ========================================================= + // Helper functions + // ========================================================= + + /// Randomly choose new centroids + /** Centroid is randomly chosen with k-means++ algorithm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param rand Random number drawn uniformly from [0,1). + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are n x d. + * @param dists (Input, device memory, 2*n entries) Workspace. The + * first n entries should be the distance between observation + * vectors and the closest centroid. + * @param centroid (Output, device memory, d entries) Centroid + * coordinates. + * @return Zero if successful. Otherwise non-zero. 
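chooseNewCentroid, documented just above, implements the k-means++ sampling rule on the device with thrust::inclusive_scan followed by thrust::lower_bound: observation i is drawn with probability proportional to its (squared) distance to the closest existing centroid. A small host-side sketch of the same rule, with hypothetical names:

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>

// Return the index of the next k-means++ centroid. dist[i] holds the squared
// distance from observation i to its closest current centroid; i is chosen with
// probability dist[i] / sum(dist).
int choose_next_centroid(const std::vector<double>& dist, std::mt19937& rng) {
  std::vector<double> cumsum(dist.size());
  std::partial_sum(dist.begin(), dist.end(), cumsum.begin());  // inclusive scan
  std::uniform_real_distribution<double> u(0.0, 1.0);
  double target = u(rng) * cumsum.back();
  auto it = std::lower_bound(cumsum.begin(), cumsum.end(), target);
  std::size_t idx = static_cast<std::size_t>(it - cumsum.begin());
  if (idx >= dist.size()) idx = dist.size() - 1;  // clamp, as the device code does
  return static_cast<int>(idx);
}
```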
+ */ + template static + int chooseNewCentroid(IndexType_ n, IndexType_ d, IndexType_ k, + ValueType_ rand, + const ValueType_ * __restrict__ obs, + ValueType_ * __restrict__ dists, + ValueType_ * __restrict__ centroid) { + + using namespace thrust; + + // Cumulative sum of distances + ValueType_ * distsCumSum = dists + n; + // Residual sum of squares + ValueType_ distsSum; + // Observation vector that is chosen as new centroid + IndexType_ obsIndex; + + // Compute cumulative sum of distances + inclusive_scan(device_pointer_cast(dists), + device_pointer_cast(dists+n), + device_pointer_cast(distsCumSum)); + cudaCheckError(); + CHECK_CUDA(cudaMemcpy(&distsSum, distsCumSum+n-1, + sizeof(ValueType_), + cudaMemcpyDeviceToHost)); + + // Randomly choose observation vector + // Probabilities are proportional to square of distance to closest + // centroid (see k-means++ algorithm) + obsIndex = (lower_bound(device_pointer_cast(distsCumSum), + device_pointer_cast(distsCumSum+n), + distsSum*rand) + - device_pointer_cast(distsCumSum)); + cudaCheckError(); + obsIndex = max(obsIndex, 0); + obsIndex = min(obsIndex, n-1); + + // Record new centroid position + CHECK_CUDA(cudaMemcpyAsync(centroid, obs+IDX(0,obsIndex,d), + d*sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + + return 0; + + } + + /// Choose initial cluster centroids for k-means algorithm + /** Centroids are randomly chosen with k-means++ algorithm + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param dists (Output, device memory, 2*n entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @return Zero if successful. Otherwise non-zero. 
+ */ + template static + int initializeCentroids(IndexType_ n, IndexType_ d, IndexType_ k, + const ValueType_ * __restrict__ obs, + ValueType_ * __restrict__ centroids, + IndexType_ * __restrict__ codes, + IndexType_ * __restrict__ clusterSizes, + ValueType_ * __restrict__ dists) { + + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Loop index + IndexType_ i; + + // CUDA grid dimensions + dim3 blockDim_warp, gridDim_warp, gridDim_block; + + // Random number generator + thrust::default_random_engine rng(123456); + thrust::uniform_real_distribution uniformDist(0,1); + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Initialize grid dimensions + blockDim_warp.x = WARP_SIZE; + blockDim_warp.y = 1; + blockDim_warp.z = BSIZE_DIV_WSIZE; + gridDim_warp.x = min((d+WARP_SIZE-1)/WARP_SIZE, 65535); + gridDim_warp.y = 1; + gridDim_warp.z + = min((n+BSIZE_DIV_WSIZE-1)/BSIZE_DIV_WSIZE, 65535); + gridDim_block.x = min((n+BLOCK_SIZE-1)/BLOCK_SIZE, 65535); + gridDim_block.y = 1; + gridDim_block.z = 1; + + // Assign observation vectors to code 0 + CHECK_CUDA(cudaMemsetAsync(codes, 0, n*sizeof(IndexType_))); + + // Choose first centroid + thrust::fill(thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists+n), 1); + cudaCheckError(); + if(chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids)) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from first centroid + CHECK_CUDA(cudaMemsetAsync(dists, 0, n*sizeof(ValueType_))); + computeDistances <<< gridDim_warp, blockDim_warp >>> + (n, d, 1, obs, centroids, dists); + cudaCheckError() + + // Choose remaining centroids + for(i=1; i>> + (n, d, 1, obs, centroids+IDX(0,i,d), dists+n); + cudaCheckError(); + + // Recompute minimum distances + minDistances2 <<< gridDim_block, BLOCK_SIZE >>> + (n, dists, dists+n, codes, i); + cudaCheckError(); + + } + + // Compute cluster sizes + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k*sizeof(IndexType_))); + computeClusterSizes <<< gridDim_block, BLOCK_SIZE >>> + (n, k, codes, clusterSizes); + cudaCheckError(); + + return 0; + + } + + /// Find cluster centroids closest to observation vectors + /** Distance is measured with Euclidean norm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Input, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param dists (Output, device memory, n*k entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares of assignment. + * @return Zero if successful. Otherwise non-zero. 
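assignCentroids, documented just above, is the assignment step of Lloyd's algorithm: compute squared Euclidean distances to all k centroids (computeDistances), take the argmin per observation (minDistances), and accumulate the residual. A straightforward host-side equivalent for reference (illustrative only):

```cpp
#include <limits>
#include <vector>

// Assign each observation (column of a d x n column-major matrix) to the nearest
// of k centroids (columns of a d x k column-major matrix); returns the residual
// sum of squares.
double assign_centroids(int n, int d, int k,
                        const std::vector<double>& obs,        // d*n
                        const std::vector<double>& centroids,  // d*k
                        std::vector<int>& codes,               // n
                        std::vector<int>& cluster_sizes) {     // k
  codes.assign(n, 0);
  cluster_sizes.assign(k, 0);
  double residual = 0.0;
  for (int i = 0; i < n; ++i) {
    double best = std::numeric_limits<double>::max();
    int best_j = 0;
    for (int j = 0; j < k; ++j) {
      double dist = 0.0;
      for (int t = 0; t < d; ++t) {
        double diff = obs[t + i * d] - centroids[t + j * d];
        dist += diff * diff;
      }
      if (dist < best) { best = dist; best_j = j; }
    }
    codes[i] = best_j;
    ++cluster_sizes[best_j];
    residual += best;
  }
  return residual;
}
```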
+ */ + template static + int assignCentroids(IndexType_ n, IndexType_ d, IndexType_ k, + const ValueType_ * __restrict__ obs, + const ValueType_ * __restrict__ centroids, + ValueType_ * __restrict__ dists, + IndexType_ * __restrict__ codes, + IndexType_ * __restrict__ clusterSizes, + ValueType_ * residual_host) { + + // CUDA grid dimensions + dim3 blockDim, gridDim; + + // Compute distance between centroids and observation vectors + CHECK_CUDA(cudaMemsetAsync(dists, 0, n*k*sizeof(ValueType_))); + blockDim.x = WARP_SIZE; + blockDim.y = 1; + blockDim.z = BLOCK_SIZE/WARP_SIZE; + gridDim.x = min((d+WARP_SIZE-1)/WARP_SIZE, 65535); + gridDim.y = min(k, 65535); + gridDim.z = min((n+BSIZE_DIV_WSIZE-1)/BSIZE_DIV_WSIZE, 65535); + computeDistances <<< gridDim, blockDim >>> (n, d, k, + obs, centroids, + dists); + cudaCheckError(); + + // Find centroid closest to each observation vector + CHECK_CUDA(cudaMemsetAsync(clusterSizes,0,k*sizeof(IndexType_))); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n+BLOCK_SIZE-1)/BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + minDistances <<< gridDim, blockDim >>> (n, k, dists, codes, + clusterSizes); + cudaCheckError(); + + // Compute residual sum of squares + *residual_host + = thrust::reduce(thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists+n)); + + return 0; + + } + + /// Update cluster centroids for k-means algorithm + /** All clusters are assumed to be non-empty. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Input, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Input, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*d entries) Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @return Zero if successful. Otherwise non-zero. 
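updateCentroids, documented just above, obtains the per-cluster sums on the GPU with a transpose plus stable_sort_by_key / reduce_by_key; the result is simply the mean of the observations assigned to each cluster. Host-side equivalent, assuming no cluster is empty (as the kernels also assume):

```cpp
#include <cstddef>
#include <vector>

// Recompute each centroid as the mean of its assigned observations.
void update_centroids(int n, int d, int k,
                      const std::vector<double>& obs,         // d*n, column-major
                      const std::vector<int>& codes,          // n
                      const std::vector<int>& cluster_sizes,  // k
                      std::vector<double>& centroids) {       // d*k, column-major
  centroids.assign(static_cast<std::size_t>(d) * k, 0.0);
  for (int i = 0; i < n; ++i)
    for (int t = 0; t < d; ++t)
      centroids[t + codes[i] * d] += obs[t + i * d];
  for (int j = 0; j < k; ++j)
    for (int t = 0; t < d; ++t)
      centroids[t + j * d] /= cluster_sizes[j];
}
```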
+ */ + template static + int updateCentroids(IndexType_ n, IndexType_ d, IndexType_ k, + const ValueType_ * __restrict__ obs, + const IndexType_ * __restrict__ codes, + const IndexType_ * __restrict__ clusterSizes, + ValueType_ * __restrict__ centroids, + ValueType_ * __restrict__ work, + IndexType_ * __restrict__ work_int) { + + using namespace thrust; + + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Useful constants + const ValueType_ one = 1; + const ValueType_ zero = 0; + + // CUDA grid dimensions + dim3 blockDim, gridDim; + + // Device memory + device_ptr obs_copy(work); + device_ptr codes_copy(work_int); + device_ptr rows(work_int+d*n); + + // Take transpose of observation matrix + Cublas::geam(true, false, n, d, + &one, obs, d, &zero, (ValueType_*) NULL, n, + raw_pointer_cast(obs_copy), n); + + // Cluster assigned to each observation matrix entry + sequence(rows, rows+d*n); + cudaCheckError(); + transform(rows, rows+d*n, make_constant_iterator(n), + rows, modulus()); + cudaCheckError(); + gather(rows, rows+d*n, device_pointer_cast(codes), codes_copy); + cudaCheckError(); + + // Row associated with each observation matrix entry + sequence(rows, rows+d*n); + cudaCheckError(); + transform(rows, rows+d*n, make_constant_iterator(n), + rows, divides()); + cudaCheckError(); + + // Sort and reduce to add observation vectors in same cluster + stable_sort_by_key(codes_copy, codes_copy+d*n, + make_zip_iterator(make_tuple(obs_copy, rows))); + cudaCheckError(); + reduce_by_key(rows, rows+d*n, obs_copy, + codes_copy, // Output to codes_copy is ignored + device_pointer_cast(centroids)); + cudaCheckError(); + + // Divide sums by cluster size to get centroid matrix + blockDim.x = WARP_SIZE; + blockDim.y = BLOCK_SIZE/WARP_SIZE; + blockDim.z = 1; + gridDim.x = min((d+WARP_SIZE-1)/WARP_SIZE, 65535); + gridDim.y = min((k+BSIZE_DIV_WSIZE-1)/BSIZE_DIV_WSIZE, 65535); + gridDim.z = 1; + divideCentroids <<< gridDim, blockDim >>> (d, k, clusterSizes, + centroids); + cudaCheckError(); + + return 0; + + } + +} + +namespace nvgraph { + + // ========================================================= + // k-means algorithm + // ========================================================= + + /// Find clusters with k-means algorithm + /** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*max(k,d) entries) + * Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. 
+ * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares (sum of squares of distances between observation + * vectors and centroids). + * @param iters_host (Output, host memory, 1 entry) Number of + * k-means iterations. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR kmeans(IndexType_ n, IndexType_ d, IndexType_ k, + ValueType_ tol, IndexType_ maxiter, + const ValueType_ * __restrict__ obs, + IndexType_ * __restrict__ codes, + IndexType_ * __restrict__ clusterSizes, + ValueType_ * __restrict__ centroids, + ValueType_ * __restrict__ work, + IndexType_ * __restrict__ work_int, + ValueType_ * residual_host, + IndexType_ * iters_host) { + + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Current iteration + IndexType_ iter; + + // Residual sum of squares at previous iteration + ValueType_ residualPrev = 0; + + // Random number generator + thrust::default_random_engine rng(123456); + thrust::uniform_real_distribution uniformDist(0,1); + + // ------------------------------------------------------- + // Initialization + // ------------------------------------------------------- + + // Check that parameters are valid + if(n < 1) { + WARNING("invalid parameter (n<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(d < 1) { + WARNING("invalid parameter (d<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(k < 1) { + WARNING("invalid parameter (k<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxiter < 0) { + WARNING("invalid parameter (maxiter<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Trivial cases + if(k == 1) { + CHECK_CUDA(cudaMemsetAsync(codes, 0, n*sizeof(IndexType_))); + CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), + cudaMemcpyHostToDevice)); + if(updateCentroids(n, d, k, obs, codes, + clusterSizes, centroids, + work, work_int)) + WARNING("could not compute k-means centroids"); + dim3 blockDim, gridDim; + blockDim.x = WARP_SIZE; + blockDim.y = 1; + blockDim.z = BLOCK_SIZE/WARP_SIZE; + gridDim.x = min((d+WARP_SIZE-1)/WARP_SIZE, 65535); + gridDim.y = 1; + gridDim.z = min((n+BLOCK_SIZE/WARP_SIZE-1)/(BLOCK_SIZE/WARP_SIZE), 65535); + CHECK_CUDA(cudaMemsetAsync(work, 0, n*k*sizeof(ValueType_))); + computeDistances <<< gridDim, blockDim >>> (n, d, 1, + obs, + centroids, + work); + cudaCheckError(); + *residual_host = thrust::reduce(thrust::device_pointer_cast(work), + thrust::device_pointer_cast(work+n)); + cudaCheckError(); + return NVGRAPH_OK; + } + if(n <= k) { + thrust::sequence(thrust::device_pointer_cast(codes), + thrust::device_pointer_cast(codes+n)); + cudaCheckError(); + thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); + cudaCheckError(); + + if(n < k) + CHECK_CUDA(cudaMemsetAsync(clusterSizes+n, 0, (k-n)*sizeof(IndexType_))); + CHECK_CUDA(cudaMemcpyAsync(centroids, obs, d*n*sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + *residual_host = 0; + return NVGRAPH_OK; + } + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if(initializeCentroids(n, d, k, obs, centroids, codes, + clusterSizes, work)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence + 
for(iter=0; iter= tol) + WARNING("k-means failed to converge"); + + *iters_host = iter; + return NVGRAPH_OK; + + } + + /// Find clusters with k-means algorithm + /** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * CNMEM must be initialized before calling this function. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param residual On exit, residual sum of squares (sum of squares + * of distances between observation vectors and centroids). + * @param On exit, number of k-means iterations. + * @return NVGRAPH error flag + */ + template + NVGRAPH_ERROR kmeans(IndexType_ n, IndexType_ d, IndexType_ k, + ValueType_ tol, IndexType_ maxiter, + const ValueType_ * __restrict__ obs, + IndexType_ * __restrict__ codes, + ValueType_ & residual, + IndexType_ & iters) { + + // Check that parameters are valid + if(n < 1) { + WARNING("invalid parameter (n<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(d < 1) { + WARNING("invalid parameter (d<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(k < 1) { + WARNING("invalid parameter (k<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxiter < 0) { + WARNING("invalid parameter (maxiter<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Allocate memory + // TODO: handle non-zero CUDA streams + cudaStream_t stream = 0; + Vector clusterSizes(k, stream); + Vector centroids(d*k, stream); + Vector work(n*max(k,d), stream); + Vector work_int(2*d*n, stream); + + // Perform k-means + return kmeans(n, d, k, tol, maxiter, + obs, codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), work_int.raw(), + &residual, &iters); + + } + + + // ========================================================= + // Explicit instantiations + // ========================================================= + + template + NVGRAPH_ERROR kmeans(int n, int d, int k, + float tol, int maxiter, + const float * __restrict__ obs, + int * __restrict__ codes, + float & residual, + int & iters); + template + NVGRAPH_ERROR kmeans(int n, int d, int k, + double tol, int maxiter, + const double * __restrict__ obs, + int * __restrict__ codes, + double & residual, + int & iters); +} +//#endif //NVGRAPH_PARTITION +//#endif //debug + diff --git a/cpp/nvgraph/cpp/src/lanczos.cu b/cpp/nvgraph/cpp/src/lanczos.cu new file mode 100644 index 00000000000..fbf4a57b69c --- /dev/null +++ b/cpp/nvgraph/cpp/src/lanczos.cu @@ -0,0 +1,1561 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
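lanczos.cu, which starts here, drives an implicitly restarted Lanczos method: performLanczosIteration further below builds the tridiagonal alpha/beta system through the classic three-term recurrence, using cuBLAS on a shifted sparse operator and optionally reorthogonalizing. As a plain dense reference for that recurrence (hypothetical names; reorthogonalization and the shift are omitted):

```cpp
#include <cmath>
#include <vector>

// Lanczos tridiagonalization of a dense symmetric n x n matrix A (row-major).
// Produces the diagonal (alpha) and off-diagonal (beta) of the tridiagonal
// matrix whose eigenvalues approximate those of A.
void lanczos_tridiag(const std::vector<double>& A, int n, int m,
                     std::vector<double>& alpha, std::vector<double>& beta) {
  std::vector<double> v_prev(n, 0.0), v(n, 0.0), w(n);
  v[0] = 1.0;  // arbitrary unit-norm start vector
  alpha.assign(m, 0.0);
  beta.assign(m, 0.0);
  for (int j = 0; j < m; ++j) {
    for (int i = 0; i < n; ++i) {  // w = A*v - beta_{j-1} * v_prev
      double s = 0.0;
      for (int t = 0; t < n; ++t) s += A[i * n + t] * v[t];
      w[i] = s - (j > 0 ? beta[j - 1] : 0.0) * v_prev[i];
    }
    for (int i = 0; i < n; ++i) alpha[j] += v[i] * w[i];  // alpha_j = v^T w
    for (int i = 0; i < n; ++i) w[i] -= alpha[j] * v[i];  // orthogonalize against v
    double nrm = 0.0;
    for (int i = 0; i < n; ++i) nrm += w[i] * w[i];
    beta[j] = std::sqrt(nrm);
    if (beta[j] == 0.0) { alpha.resize(j + 1); beta.resize(j); return; }  // breakdown/convergence
    for (int i = 0; i < n; ++i) { v_prev[i] = v[i]; v[i] = w[i] / beta[j]; }
  }
  beta.resize(m - 1);  // only m-1 off-diagonal entries belong to the m x m tridiagonal
}
```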
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#ifdef NVGRAPH_PARTITION + +#define _USE_MATH_DEFINES +#include +#include "lanczos.hxx" + +#include +#include + +#include + +#define USE_CURAND 1 + +#ifdef USE_CURAND + #include +#endif + +#include "nvgraph_error.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_vector_kernels.hxx" +#include "nvgraph_cublas.hxx" +#include "nvgraph_lapack.hxx" +#include "debug_macros.h" +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i,j,lda) ((i)+(j)*(lda)) + +// ========================================================= +// Macros and functions for cuRAND +// ========================================================= +//#ifdef USE_CURAND +//namespace { +// +// /// Get message string from cuRAND status code +// //static +// //const char* curandGetErrorString(curandStatus_t e) { +// // switch(e) { +// // case CURAND_STATUS_SUCCESS: +// // return "CURAND_STATUS_SUCCESS"; +// // case CURAND_STATUS_VERSION_MISMATCH: +// // return "CURAND_STATUS_VERSION_MISMATCH"; +// // case CURAND_STATUS_NOT_INITIALIZED: +// // return "CURAND_STATUS_NOT_INITIALIZED"; +// // case CURAND_STATUS_ALLOCATION_FAILED: +// // return "CURAND_STATUS_ALLOCATION_FAILED"; +// // case CURAND_STATUS_TYPE_ERROR: +// // return "CURAND_STATUS_TYPE_ERROR"; +// // case CURAND_STATUS_OUT_OF_RANGE: +// // return "CURAND_STATUS_OUT_OF_RANGE"; +// // case CURAND_STATUS_LENGTH_NOT_MULTIPLE: +// // return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; +// // case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: +// // return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; +// // case CURAND_STATUS_LAUNCH_FAILURE: +// // return "CURAND_STATUS_LAUNCH_FAILURE"; +// // case CURAND_STATUS_PREEXISTING_FAILURE: +// // return "CURAND_STATUS_PREEXISTING_FAILURE"; +// // case CURAND_STATUS_INITIALIZATION_FAILED: +// // return "CURAND_STATUS_INITIALIZATION_FAILED"; +// // case CURAND_STATUS_ARCH_MISMATCH: +// // return "CURAND_STATUS_ARCH_MISMATCH"; +// // case CURAND_STATUS_INTERNAL_ERROR: +// // return "CURAND_STATUS_INTERNAL_ERROR"; +// // default: +// // return "unknown cuRAND error"; +// // } +// //} +// +// // curandGeneratorNormalX +// inline static +// curandStatus_t +// curandGenerateNormalX(curandGenerator_t generator, +// float * outputPtr, size_t n, +// float mean, float stddev) { +// return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +// } +// inline static +// curandStatus_t +// curandGenerateNormalX(curandGenerator_t generator, +// double * outputPtr, size_t n, +// double mean, double stddev) { +// return curandGenerateNormalDouble(generator, outputPtr, +// n, mean, stddev); +// } +// +//} +//#endif + +namespace nvgraph { + + namespace { + + // ========================================================= + // Helper functions + // ========================================================= + + /// Perform Lanczos iteration + /** Lanczos iteration is performed on a shifted matrix A+shift*I. + * + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. 
On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. Otherwise non-zero. + */ + template static + int performLanczosIteration(const Matrix * A, + IndexType_ * iter, + IndexType_ maxIter, + ValueType_ shift, + ValueType_ tol, + bool reorthogonalize, + ValueType_ * __restrict__ alpha_host, + ValueType_ * __restrict__ beta_host, + ValueType_ * __restrict__ lanczosVecs_dev, + ValueType_ * __restrict__ work_dev) { + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + const ValueType_ one = 1; + const ValueType_ negOne = -1; + const ValueType_ zero = 0; + + IndexType_ n = A->n; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if(*iter<=0) { + *iter = 1; + + // Apply matrix + if(shift != 0) + CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev+n, lanczosVecs_dev, + n*sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev+n); + + // Orthogonalize Lanczos vector + Cublas::dot(n, + lanczosVecs_dev, 1, + lanczosVecs_dev+IDX(0,1,n), 1, + alpha_host); + Cublas::axpy(n, -alpha_host[0], + lanczosVecs_dev, 1, + lanczosVecs_dev+IDX(0,1,n), 1); + beta_host[0] = Cublas::nrm2(n, lanczosVecs_dev+IDX(0,1,n), 1); + + // Check if Lanczos has converged + if(beta_host[0] <= tol) + return 0; + + // Normalize Lanczos vector + Cublas::scal(n, 1/beta_host[0], lanczosVecs_dev+IDX(0,1,n), 1); + + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while(*itermv(1, lanczosVecs_dev+IDX(0,*iter-1,n), + shift, lanczosVecs_dev+IDX(0,*iter,n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if(reorthogonalize) { + Cublas::gemv(true, n, *iter, + &one, lanczosVecs_dev, n, + lanczosVecs_dev+IDX(0,*iter,n), 1, + &zero, work_dev, 1); + Cublas::gemv(false, n, *iter, + &negOne, lanczosVecs_dev, n, work_dev, 1, + &one, lanczosVecs_dev+IDX(0,*iter,n), 1); + CHECK_CUDA(cudaMemcpyAsync(alpha_host+(*iter-1), work_dev+(*iter-1), + sizeof(ValueType_), cudaMemcpyDeviceToHost)); + Cublas::gemv(true, n, *iter, + &one, lanczosVecs_dev, n, + lanczosVecs_dev+IDX(0,*iter,n), 1, + &zero, work_dev, 1); + Cublas::gemv(false, n, *iter, + &negOne, lanczosVecs_dev, n, work_dev, 1, + &one, lanczosVecs_dev+IDX(0,*iter,n), 1); + } + + + // Orthogonalization with 3-term recurrence relation + else { + 
Cublas::dot(n, lanczosVecs_dev+IDX(0,*iter-1,n), 1, + lanczosVecs_dev+IDX(0,*iter,n), 1, + alpha_host+(*iter-1)); + Cublas::axpy(n, -alpha_host[*iter-1], + lanczosVecs_dev+IDX(0,*iter-1,n), 1, + lanczosVecs_dev+IDX(0,*iter,n), 1); + Cublas::axpy(n, -beta_host[*iter-2], + lanczosVecs_dev+IDX(0,*iter-2,n), 1, + lanczosVecs_dev+IDX(0,*iter,n), 1); + } + + // Compute residual + beta_host[*iter-1] = Cublas::nrm2(n, lanczosVecs_dev+IDX(0,*iter,n), 1); + + // Check if Lanczos has converged + if(beta_host[*iter-1] <= tol) + break; + // Normalize Lanczos vector + Cublas::scal(n, 1/beta_host[*iter-1], + lanczosVecs_dev+IDX(0,*iter,n), 1); + + } + + CHECK_CUDA(cudaDeviceSynchronize()); + + return 0; + + } + + /// Find Householder transform for 3-dimensional system + /** Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ + template static + void findHouseholder3(ValueType_ * v, ValueType_ * Pv, + ValueType_ * P) { + + // Compute norm of vector + *Pv = std::sqrt(v[0]*v[0]+v[1]*v[1]+v[2]*v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if(v[0] >= 0) + *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + ValueType_ normHouseholder = std::sqrt(v[0]*v[0]+v[1]*v[1]+v[2]*v[2]); + if(normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } + else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + IndexType_ i, j; + for(j=0; j<3; ++j) + for(i=0; i<3; ++i) + P[IDX(i,j,3)] = -2*v[i]*v[j]; + for(i=0; i<3; ++i) + P[IDX(i,i,3)] += 1; + + } + + /// Apply 3-dimensional Householder transform to 4 x 4 matrix + /** The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. + */ + template static + void applyHouseholder3(const ValueType_ * v, ValueType_ * A) { + + // Loop indices + IndexType_ i, j; + // Dot product between Householder vector and matrix row/column + ValueType_ vDotA; + + // Pre-apply Householder transform + for(j=0; j<4; ++j) { + vDotA = 0; + for(i=0; i<3; ++i) + vDotA += v[i]*A[IDX(i,j,4)]; + for(i=0; i<3; ++i) + A[IDX(i,j,4)] -= 2*v[i]*vDotA; + } + + // Post-apply Householder transform + for(i=0; i<4; ++i) { + vDotA = 0; + for(j=0; j<3; ++j) + vDotA += A[IDX(i,j,4)]*v[j]; + for(j=0; j<3; ++j) + A[IDX(i,j,4)] -= 2*vDotA*v[j]; + } + + } + + /// Perform one step of Francis QR algorithm + /** Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. 
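findHouseholder3 and applyHouseholder3 above build and apply the reflectors used for bulge chasing in the Francis QR step. The construction is the standard one, P = I - 2*h*h^T with h derived from the target vector and the sign of ||x|| chosen to avoid cancellation; a small host-side sketch of that construction for a vector of any length (names are mine):

```cpp
#include <cmath>
#include <vector>

// Turn x (passed in h) into the unit Householder vector h such that
// (I - 2*h*h^T) * x = Px0 * e_1. The sign of Px0 is opposite to x[0] so that
// x[0] - Px0 never suffers catastrophic cancellation. Returns Px0.
double make_householder(std::vector<double>& h) {
  double norm = 0.0;
  for (double xi : h) norm += xi * xi;
  norm = std::sqrt(norm);
  double Px0 = (h[0] >= 0.0) ? -norm : norm;  // reflect onto +/- e_1
  h[0] -= Px0;
  double hnorm = 0.0;
  for (double hi : h) hnorm += hi * hi;
  hnorm = std::sqrt(hnorm);
  if (hnorm > 0.0)
    for (double& hi : h) hi /= hnorm;
  else
    for (double& hi : h) hi = 0.0;  // x was the zero vector: leave h = 0, i.e. P = I
  return Px0;
}
```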
+ * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ + template static + int francisQRIteration(IndexType_ n, + ValueType_ shift1, ValueType_ shift2, + ValueType_ * alpha, ValueType_ * beta, + ValueType_ * V, ValueType_ * work) { + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + ValueType_ bulge[16]; + + // Householder vector + ValueType_ householder[3]; + // Householder matrix + ValueType_ householderMatrix[3*3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + ValueType_ b = -shift1 - shift2; + ValueType_ c = shift1*shift2; + + // Loop indices + IndexType_ i, j, pos; + // Temporary variable + ValueType_ temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0]*alpha[0] + beta[0]*beta[0] + b*alpha[0] + c; + householder[1] = beta[0]*(alpha[0]+alpha[1]+b); + householder[2] = beta[0]*beta[1]; + findHouseholder3(householder, &temp, + householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16*sizeof(ValueType_)); + for(i=0; i<4; ++i) + bulge[IDX(i,i,4)] = alpha[i]; + for(i=0; i<3; ++i) { + bulge[IDX(i+1,i,4)] = beta[i]; + bulge[IDX(i,i+1,4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, + 1, V, n, householderMatrix, 3, + 0, work, n); + memcpy(V, work, 3*n*sizeof(ValueType_)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for(pos=0; pos(householder, beta+pos, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, + 1, V+IDX(0,pos+1,n), n, + householderMatrix, 3, + 0, work, n); + memcpy(V+IDX(0,pos+1,n), work, 3*n*sizeof(ValueType_)); + + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n-4] = bulge[IDX(0,0,4)]; + householder[0] = bulge[IDX(1,0,4)]; + householder[1] = bulge[IDX(2,0,4)]; + householder[2] = bulge[IDX(3,0,4)]; + for(j=0; j<3; ++j) + for(i=0; i<3; ++i) + bulge[IDX(i,j,4)] = bulge[IDX(i+1,j+1,4)]; + bulge[IDX(3,0,4)] = 0; + bulge[IDX(3,1,4)] = 0; + bulge[IDX(3,2,4)] = 0; + bulge[IDX(0,3,4)] = 0; + bulge[IDX(1,3,4)] = 0; + bulge[IDX(2,3,4)] = 0; + bulge[IDX(3,3,4)] = 0; + findHouseholder3(householder, beta+n-4, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, + 1, V+IDX(0,n-3,n), n, + householderMatrix, 3, + 0, work, n); + memcpy(V+IDX(0,n-3,n), work, 3*n*sizeof(ValueType_)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n-3] = bulge[IDX(0,0,4)]; + householder[0] = bulge[IDX(1,0,4)]; + householder[1] = bulge[IDX(2,0,4)]; + householder[2] = 0; + for(j=0; j<3; ++j) + for(i=0; i<3; ++i) + bulge[IDX(i,j,4)] = bulge[IDX(i+1,j+1,4)]; + findHouseholder3(householder, beta+n-3, + householderMatrix); + 
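+    // The reflector computed above satisfies P = I - 2*v*v' with P*v proportional
+    // to e_1, so applying it below removes the last sub-diagonal fill-in; the
+    // remaining 2 x 2 block of the bulge then supplies the final diagonal and
+    // off-diagonal entries.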
applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 2, 2, + 1, V+IDX(0,n-2,n), n, + householderMatrix, 3, + 0, work, n); + memcpy(V+IDX(0,n-2,n), work, 2*n*sizeof(ValueType_)); + + // Bulge has been eliminated + alpha[n-2] = bulge[IDX(0,0,4)]; + alpha[n-1] = bulge[IDX(1,1,4)]; + beta[n-2] = bulge[IDX(1,0,4)]; + + return 0; + + } + + /// Perform implicit restart of Lanczos algorithm + /** Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. + * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. + */ + template static + int lanczosRestart(IndexType_ n, + IndexType_ iter, + IndexType_ iter_new, + ValueType_ * shiftUpper, + ValueType_ * shiftLower, + ValueType_ * __restrict__ alpha_host, + ValueType_ * __restrict__ beta_host, + ValueType_ * __restrict__ V_host, + ValueType_ * __restrict__ work_host, + ValueType_ * __restrict__ lanczosVecs_dev, + ValueType_ * __restrict__ work_dev, + bool smallest_eig) { + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + const ValueType_ zero = 0; + const ValueType_ one = 1; + + // Loop index + IndexType_ i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + IndexType_ restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + ValueType_ * ritzVals_host = work_host + 3*iter; + // Shifts for implicit restart + ValueType_ * shifts_host; + + // Orthonormal matrix for similarity transform + ValueType_ * V_dev = work_dev + n*iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter*sizeof(ValueType_)); + memcpy(work_host, beta_host, (iter-1)*sizeof(ValueType_)); + Lapack::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + //for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + //std::cout < *shiftUpper) { + *shiftUpper = ritzVals_host[iter-1]; + *shiftLower = ritzVals_host[iter_new]; + } + else { + *shiftUpper = max(*shiftUpper, ritzVals_host[iter-1]); + *shiftLower = min(*shiftLower, ritzVals_host[iter_new]); + } + } + else { 
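+      // When the largest eigenvalues are wanted, the unwanted part of the
+      // spectrum lies at the low end of the (ascending) Ritz values, so the
+      // restart bounds are taken from the smallest Ritz values instead.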
+ if(*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter-iter_new-1]; + *shiftLower = ritzVals_host[0]; + } + else { + *shiftUpper = max(*shiftUpper, ritzVals_host[iter-iter_new-1]); + *shiftLower = min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for(i=0; i(M_PI)/restartSteps); + shifts_host[i] *= 0.5*((*shiftUpper)-(*shiftLower)); + shifts_host[i] += 0.5*((*shiftUpper)+(*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for(i=0; i + NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ * effIter, + IndexType_ * totalIter, + ValueType_ * shift, + ValueType_ * __restrict__ alpha_host, + ValueType_ * __restrict__ beta_host, + ValueType_ * __restrict__ lanczosVecs_dev, + ValueType_ * __restrict__ work_dev, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev) { + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + const ValueType_ one = 1; + const ValueType_ zero = 0; + + // Matrix dimension + IndexType_ n = A->n; + + // Shift for implicit restart + ValueType_ shiftUpper; + ValueType_ shiftLower; + + // Lanczos iteration counters + IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + IndexType_ i; + + // Host memory + ValueType_ * Z_host; // Eigenvectors in Lanczos basis + ValueType_ * work_host; // Workspace + + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + //Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + if(A->m != A->n) { + WARNING("invalid parameter (matrix is not square)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(restartIter < 1) { + WARNING("invalid parameter (restartIter<4)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs > n) { + WARNING("invalid parameters (nEigVecs>n)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxIter < nEigVecs) { + WARNING("invalid parameters (maxIter + (A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, + alpha_host, beta_host, lanczosVecs_dev, work_dev); + if(status) WARNING("error in Lanczos iteration"); + + // Determine largest eigenvalue + + Lapack::sterf(*effIter, alpha_host, beta_host); + *shift = -alpha_host[*effIter-1]; + //std::cout << *shift < + (A, effIter, maxIter_curr, *shift, 0, reorthogonalize, + alpha_host, beta_host, lanczosVecs_dev, work_dev); + if(status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + shiftLower = 1; + shiftUpper = -1; + while(*totalItertol*shiftLower) { + + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + IndexType_ iter_new = nEigVecs+1; + if(restartIter-(maxIter-*totalIter) > nEigVecs+1) + iter_new = restartIter-(maxIter-*totalIter); + if((restartIter-iter_new) % 2) 
+ iter_new -= 1; + if(iter_new==*effIter) + break; + + // Implicit restart of Lanczos method + status = + lanczosRestart + (n, *effIter, iter_new, + &shiftUpper, &shiftLower, + alpha_host, beta_host, Z_host, work_host, + lanczosVecs_dev, work_dev, true); + if(status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if(beta_host[*effIter-1] <= tol*fabs(shiftLower)) + break; + + // Proceed with Lanczos method + //maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); + status = + performLanczosIteration + (A, effIter, maxIter_curr, + *shift, tol*fabs(shiftLower), reorthogonalize, + alpha_host, beta_host, lanczosVecs_dev, work_dev); + if(status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter-iter_new; + + } + + // Warning if Lanczos has failed to converge + if(beta_host[*effIter-1] > tol*fabs(shiftLower)) + { + WARNING("implicitly restarted Lanczos failed to converge"); + } + + // Solve tridiagonal system + memcpy(work_host+2*(*effIter), alpha_host, (*effIter)*sizeof(ValueType_)); + memcpy(work_host+3*(*effIter), beta_host, (*effIter-1)*sizeof(ValueType_)); + Lapack::steqr('I', *effIter, + work_host+2*(*effIter), work_host+3*(*effIter), + Z_host, *effIter, work_host); + + // Obtain desired eigenvalues by applying shift + for(i=0; i<*effIter; ++i) + work_host[i+2*(*effIter)] -= *shift; + for(i=*effIter; i + NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix & A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ & iter, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev) { + + // CUDA stream + // TODO: handle non-zero streams + cudaStream_t stream = 0; + + // Matrix dimension + IndexType_ n = A.n; + + // Check that parameters are valid + if(A.m != A.n) { + WARNING("invalid parameter (matrix is not square)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(restartIter < 1) { + WARNING("invalid parameter (restartIter<4)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs > n) { + WARNING("invalid parameters (nEigVecs>n)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxIter < nEigVecs) { + WARNING("invalid parameters (maxIter lanczosVecs_dev(n*(restartIter+1), stream); + Vector work_dev((n+restartIter)*restartIter, stream); + + // Perform Lanczos method + IndexType_ effIter; + ValueType_ shift; + NVGRAPH_ERROR status + = computeSmallestEigenvectors(&A, nEigVecs, maxIter, restartIter, + tol, reorthogonalize, + &effIter, &iter, &shift, + alpha_host, beta_host, + lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev); + + // Clean up and return + free(alpha_host); + free(beta_host); + return status; + + } + + // ========================================================= + // Eigensolver + // ========================================================= + + /// Compute largest eigenvectors of symmetric matrix + /** Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * + * @param A Matrix. 
+ * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR computeLargestEigenvectors(const Matrix * A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ * effIter, + IndexType_ * totalIter, + ValueType_ * __restrict__ alpha_host, + ValueType_ * __restrict__ beta_host, + ValueType_ * __restrict__ lanczosVecs_dev, + ValueType_ * __restrict__ work_dev, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev) { + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + const ValueType_ one = 1; + const ValueType_ zero = 0; + + // Matrix dimension + IndexType_ n = A->n; + + // Lanczos iteration counters + IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + IndexType_ i; + + // Host memory + ValueType_ * Z_host; // Eigenvectors in Lanczos basis + ValueType_ * work_host; // Workspace + + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + //Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + if(A->m != A->n) { + WARNING("invalid parameter (matrix is not square)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(restartIter < 1) { + WARNING("invalid parameter (restartIter<4)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs > n) { + WARNING("invalid parameters (nEigVecs>n)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxIter < nEigVecs) { + 
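+      // maxIter bounds the total number of Lanczos steps accumulated across
+      // restarts (totalIter), so it cannot be smaller than the number of
+      // requested eigenpairs.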
WARNING("invalid parameters (maxIter + (A, effIter, maxIter_curr, *shift, 0, reorthogonalize, + alpha_host, beta_host, lanczosVecs_dev, work_dev); + if(status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + ValueType_ shiftLower = 1; + ValueType_ shiftUpper = -1; + while(*totalItertol*shiftLower) { + + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + IndexType_ iter_new = nEigVecs+1; + if(restartIter-(maxIter-*totalIter) > nEigVecs+1) + iter_new = restartIter-(maxIter-*totalIter); + if((restartIter-iter_new) % 2) + iter_new -= 1; + if(iter_new==*effIter) + break; + + // Implicit restart of Lanczos method + status = + lanczosRestart + (n, *effIter, iter_new, + &shiftUpper, &shiftLower, + alpha_host, beta_host, Z_host, work_host, + lanczosVecs_dev, work_dev, false); + if(status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if(beta_host[*effIter-1] <= tol*fabs(shiftLower)) + break; + + // Proceed with Lanczos method + //maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); + status = + performLanczosIteration + (A, effIter, maxIter_curr, + *shift, tol*fabs(shiftLower), reorthogonalize, + alpha_host, beta_host, lanczosVecs_dev, work_dev); + if(status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter-iter_new; + + } + + // Warning if Lanczos has failed to converge + if(beta_host[*effIter-1] > tol*fabs(shiftLower)) + { + WARNING("implicitly restarted Lanczos failed to converge"); + } + for (int i = 0; i < restartIter; ++i) + { + for (int j = 0; j < restartIter; ++j) + Z_host[i*restartIter+j] = 0; + + } + // Solve tridiagonal system + memcpy(work_host+2*(*effIter), alpha_host, (*effIter)*sizeof(ValueType_)); + memcpy(work_host+3*(*effIter), beta_host, (*effIter-1)*sizeof(ValueType_)); + Lapack::steqr('I', *effIter, + work_host+2*(*effIter), work_host+3*(*effIter), + Z_host, *effIter, work_host); + + // note: We need to pick the top nEigVecs eigenvalues + // but effItter can be larger than nEigVecs + // hence we add an offset for that case, because we want to access top nEigVecs eigenpairs in the matrix of size effIter. 
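+    // Worked example: with effIter = 10 and nEigVecs = 4 the offset is 6, so the
+    // wanted (largest) eigenpairs are entries 6..9 of the ascending eigenvalue
+    // array of the 10 x 10 Lanczos system.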
+ // remember the array is sorted, so it is not needed for smallest eigenvalues case because the first ones are the smallest ones + + IndexType_ top_eigenparis_idx_offset = *effIter - nEigVecs; + + //Debug : print nEigVecs largest eigenvalues + //for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) + // std::cout <<*(work_host+(2*(*effIter)+i))<< " "; + //std::cout < + NVGRAPH_ERROR computeLargestEigenvectors(const Matrix & A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ & iter, + ValueType_ * __restrict__ eigVals_dev, + ValueType_ * __restrict__ eigVecs_dev) { + + // CUDA stream + // TODO: handle non-zero streams + cudaStream_t stream = 0; + + // Matrix dimension + IndexType_ n = A.n; + + // Check that parameters are valid + if(A.m != A.n) { + WARNING("invalid parameter (matrix is not square)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(restartIter < 1) { + WARNING("invalid parameter (restartIter<4)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs > n) { + WARNING("invalid parameters (nEigVecs>n)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxIter < nEigVecs) { + WARNING("invalid parameters (maxIter lanczosVecs_dev(n*(restartIter+1), stream); + Vector work_dev((n+restartIter)*restartIter, stream); + + // Perform Lanczos method + IndexType_ effIter; + NVGRAPH_ERROR status + = computeLargestEigenvectors(&A, nEigVecs, maxIter, restartIter, + tol, reorthogonalize, + &effIter, &iter, + alpha_host, beta_host, + lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev); + + // Clean up and return + free(alpha_host); + free(beta_host); + return status; + + } + + // ========================================================= + // Explicit instantiation + // ========================================================= + + template NVGRAPH_ERROR computeSmallestEigenvectors + (const Matrix * A, + int nEigVecs, int maxIter, int restartIter, float tol, + bool reorthogonalize, + int * iter, int * totalIter, float * shift, + float * __restrict__ alpha_host, + float * __restrict__ beta_host, + float * __restrict__ lanczosVecs_dev, + float * __restrict__ work_dev, + float * __restrict__ eigVals_dev, + float * __restrict__ eigVecs_dev); + template NVGRAPH_ERROR computeSmallestEigenvectors + (const Matrix * A, + int nEigVecs, int maxIter, int restartIter, double tol, + bool reorthogonalize, + int * iter, int * totalIter, double * shift, + double * __restrict__ alpha_host, + double * __restrict__ beta_host, + double * __restrict__ lanczosVecs_dev, + double * __restrict__ work_dev, + double * __restrict__ eigVals_dev, + double * __restrict__ eigVecs_dev); + template NVGRAPH_ERROR computeSmallestEigenvectors + (const Matrix & A, + int nEigVecs, + int maxIter, + int restartIter, + float tol, + bool reorthogonalize, + int & iter, + float * __restrict__ eigVals_dev, + float * __restrict__ eigVecs_dev); + template NVGRAPH_ERROR computeSmallestEigenvectors + (const Matrix & A, + int nEigVecs, + int maxIter, + int restartIter, + double tol, + bool reorthogonalize, + int & iter, + double * __restrict__ eigVals_dev, + double * __restrict__ eigVecs_dev); + + template NVGRAPH_ERROR computeLargestEigenvectors + (const Matrix * A, + int nEigVecs, int maxIter, int restartIter, float tol, + bool reorthogonalize, + int 
* iter, int * totalIter, + float * __restrict__ alpha_host, + float * __restrict__ beta_host, + float * __restrict__ lanczosVecs_dev, + float * __restrict__ work_dev, + float * __restrict__ eigVals_dev, + float * __restrict__ eigVecs_dev); + template NVGRAPH_ERROR computeLargestEigenvectors + (const Matrix * A, + int nEigVecs, int maxIter, int restartIter, double tol, + bool reorthogonalize, + int * iter, int * totalIter, + double * __restrict__ alpha_host, + double * __restrict__ beta_host, + double * __restrict__ lanczosVecs_dev, + double * __restrict__ work_dev, + double * __restrict__ eigVals_dev, + double * __restrict__ eigVecs_dev); + template NVGRAPH_ERROR computeLargestEigenvectors + (const Matrix & A, + int nEigVecs, + int maxIter, + int restartIter, + float tol, + bool reorthogonalize, + int & iter, + float * __restrict__ eigVals_dev, + float * __restrict__ eigVecs_dev); + template NVGRAPH_ERROR computeLargestEigenvectors + (const Matrix & A, + int nEigVecs, + int maxIter, + int restartIter, + double tol, + bool reorthogonalize, + int & iter, + double * __restrict__ eigVals_dev, + double * __restrict__ eigVecs_dev); + +} +//#endif //NVGRAPH_PARTITION + diff --git a/cpp/nvgraph/cpp/src/lobpcg.cu b/cpp/nvgraph/cpp/src/lobpcg.cu new file mode 100644 index 00000000000..2f80dc3ae64 --- /dev/null +++ b/cpp/nvgraph/cpp/src/lobpcg.cu @@ -0,0 +1,983 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +//#if SPECTRAL_USE_LOBPCG +#include "lobpcg.hxx" + +#include +#include +#include + +#include +#include +#include +#include +#include +//#include "spectral_parameters.h" +//#include "cuda_helper.h" +//#include "cublas_helper.h" +//#include "cusolver_helper.h" +//#include "cusparse_helper.h" +//#include "curand_helper.h" +//#include "magma_helper.h" +//#define COLLECT_TIME_STATISTICS 1 +#undef COLLECT_TIME_STATISTICS + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#endif + +static double timer (void) { +#ifdef COLLECT_TIME_STATISTICS + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +#else + return 0.0; +#endif +} + +namespace nvgraph { + + template + static int print_matrix(IndexType_ m, IndexType_ n, ValueType_ * A, IndexType_ lda, const char *s){ + IndexType_ i,j; + ValueType_ * h_A; + + if (m > lda) { + WARNING("print_matrix - invalid parameter (m > lda)"); + return -1; + } + if (Device_) { + h_A = (ValueType_ *)malloc(lda*n*sizeof(ValueType_)); + if (!h_A) { + WARNING("print_matrix - malloc failed"); + return -1; + } + cudaMemcpy(h_A, A, lda*n*sizeof(ValueType_), cudaMemcpyDeviceToHost); cudaCheckError(); + } + else { + h_A = A; + } + + printf("%s\n",s); + for (i=0; i + static __global__ void random_matrix_kernel(IndexType_ m, IndexType_ n, ValueType_ * A, IndexType_ lda, IndexType_ seed) { + IndexType_ i,j,index; + + for (j=threadIdx.y+blockIdx.y*blockDim.y; j + int random_matrix(IndexType_ m, IndexType_ n, ValueType_ * A, IndexType_ lda, IndexType_ seed, cudaStream_t s){ + + if (m > lda) { + WARNING("random_matrix - invalid parameter (m > lda)"); + return -1; + } + + //device code + dim3 gridDim, blockDim; + blockDim.x = 256; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((m+blockDim.x-1)/blockDim.x, 65535); + gridDim.y = min((n+blockDim.y-1)/blockDim.y, 65535); + gridDim.z = 1; + random_matrix_kernel<<>>(m,n,A,lda,seed); + cudaCheckError(); + + /* + //host code + IndexType_ i,j,index; + ValueType_ * h_A; + + h_A = (ValueType_ *)malloc(lda*n*sizeof(ValueType_)); + if (!h_A) { + WARNING("random_matrix - malloc failed"); + return -1; + } + cudaMemcpy(h_A, A, lda*n*sizeof(ValueType_), cudaMemcpyDeviceToHost); cudaCheckError(); + for (i=0; i + static __global__ void block_axmy_kernel(IndexType_ n, IndexType_ k, ValueType_ * alpha, ValueType_ *X, IndexType_ ldx, ValueType_ *Y, IndexType_ ldy) { + IndexType_ i,j,index; + + for (j=threadIdx.y+blockIdx.y*blockDim.y; j + int block_axmy(IndexType_ n, IndexType_ k, ValueType_ * alpha, ValueType_ *X, IndexType_ ldx, ValueType_ *Y, IndexType_ ldy, cudaStream_t s) { + //device code + dim3 gridDim, blockDim; + blockDim.x = 256; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n+blockDim.x-1)/blockDim.x, 65535); + gridDim.y = min((k+blockDim.y-1)/blockDim.y, 65535); + gridDim.z = 1; + block_axmy_kernel<<>>(n,k,alpha,X,ldx,Y,ldy); + cudaCheckError(); + + return 0; + } + + template + static __global__ void collect_sqrt_kernel(IndexType_ n, ValueType_ *A, IndexType_ lda, ValueType_ *E) { + IndexType_ i,index; + + for (i=threadIdx.x+blockIdx.x*blockDim.x; i(A[index])); + } + } + + template + int collect_sqrt_memcpy(IndexType_ n, ValueType_ *A, IndexType_ lda, ValueType_ * E, cudaStream_t s) { + //device code + dim3 gridDim, blockDim; + blockDim.x = min(n,256); + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n+blockDim.x-1)/blockDim.x, 65535); + gridDim.y = 1; + gridDim.z = 1; + collect_sqrt_kernel<<>>(n,A,lda,E); + 
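+    // The kernel above gathers E[i] = sqrt of the i-th diagonal entry of A; the
+    // solver uses it to turn the diagonal of the Gram matrix G = R'*R into
+    // per-column residual norms.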
cudaCheckError(); + + return 0; + } + + template + static __global__ void convert_to_ascending_order_kernel(IndexType_ n, ValueType_ * H_dst, IndexType_ ldd, ValueType_ * E_dst, ValueType_ * H_src, IndexType_ lds, ValueType_ * E_src){ + IndexType_ i,j,indexs,indexd; + + for (i=threadIdx.x+blockIdx.x*blockDim.x; i + int convert_to_ascending_order(IndexType_ n, ValueType_ * H_dst, IndexType_ ldd, ValueType_ * E_dst, ValueType_ * H_src, IndexType_ lds, ValueType_ * E_src, cudaStream_t s){ + //device code + dim3 gridDim, blockDim; + blockDim.x = min(n,256); + blockDim.y = (256+blockDim.x-1)/blockDim.x; + blockDim.z = 1; + gridDim.x = min((n+blockDim.x-1)/blockDim.x, 65535); + gridDim.y = min((n+blockDim.y-1)/blockDim.y, 65535); + gridDim.z = 1; + convert_to_ascending_order_kernel<<>>(n,H_dst,ldd,E_dst,H_src,lds,E_src); + cudaCheckError(); + + return 0; + } + + template + static __global__ void compute_cond_kernel (IndexType_ n, ValueType_ *E) { + //WARNING: must be launched with a single thread and block only + E[0] = E[0]/E[n-1]; + } + + template + int compute_cond(IndexType_ n, ValueType_ *E, cudaStream_t s) { + //device code + dim3 gridDim, blockDim; + blockDim.x = 1; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = 1; + gridDim.y = 1; + gridDim.z = 1; + compute_cond_kernel<<>>(n,E); + cudaCheckError(); + + return 0; + } + + template + int lobpcg_simplified(cublasHandle_t cublasHandle, + cusolverDnHandle_t cusolverHandle, + IndexType_ n, IndexType_ k, + /*const*/ Matrix * A, + ValueType_ * __restrict__ eigVecs_dev, + ValueType_ * __restrict__ eigVals_dev, + IndexType_ mit, ValueType_ tol, + ValueType_ * __restrict__ work_dev, + IndexType_ & iter) { + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + LaplacianMatrix* L = dynamic_cast< LaplacianMatrix* >(A); + //LaplacianMatrix* L = static_cast< LaplacianMatrix* >(A); + + cudaEvent_t event=NULL; + cudaStream_t s_alg=NULL,s_cublas=NULL,s_cusolver=NULL,s_cusparse=NULL; + //cudaStream_t s_magma=NULL; //magma_types.h: typedef cudaStream_t magma_queue_t; + + // Useful constants + const ValueType_ zero = 0.0; + const ValueType_ one = 1.0; + const ValueType_ mone =-1.0; + const bool sp = (sizeof(ValueType_) == 4); + const ValueType_ eps = (sp) ? 1.1920929e-7f : 2.220446049250313e-16; + const ValueType_ max_kappa= (sp) ? 
4 : 8; + //const bool use_magma = SPECTRAL_USE_MAGMA; //true; //false; + const bool use_throttle = SPECTRAL_USE_THROTTLE; //true; //false; + const bool use_normalized_laplacian = SPECTRAL_USE_NORMALIZED_LAPLACIAN; //true; //false; + const bool use_R_orthogonalization = SPECTRAL_USE_R_ORTHOGONALIZATION; //true; //false; + + // Status flags + //int minfo; + //int nb; + //int lwork; + //int liwork; + int Lwork; + int k3 = 3*k; + int k2 = 2*k; + int sz = k2; + //int nb1; + //int nb2; + //int nb3; + ValueType_ kappa; + ValueType_ kappa_average; + //ValueType_ * h_wa=NULL; + //ValueType_ * h_work=NULL; + //IndexType_ * h_iwork=NULL; + //ValueType_ * h_E=NULL; + + // Loop indices + IndexType_ i,j,start; + + //LOBPCG subspaces + ValueType_ * E=NULL; + ValueType_ * Y=NULL; + ValueType_ * X=NULL; + ValueType_ * R=NULL; + ValueType_ * P=NULL; + ValueType_ * Z=NULL; + ValueType_ * AX=NULL; + ValueType_ * AR=NULL; + ValueType_ * AP=NULL; + ValueType_ * Q=NULL; + ValueType_ * BX=NULL; + ValueType_ * BR=NULL; + ValueType_ * BP=NULL; + ValueType_ * G=NULL; + ValueType_ * H=NULL; + ValueType_ * HU=NULL; + ValueType_ * HVT=NULL; + ValueType_ * nrmR=NULL; + ValueType_ * h_nrmR=NULL; + ValueType_ * h_kappa_history=NULL; + ValueType_ * Workspace=NULL; + + double t_start=0.0,t_end=0.0,t_total=0.0,t_setup=0.0,t_mm=0.0,t_bdot=0.0,t_gemm=0.0,t_potrf=0.0,t_trsm=0.0,t_syevd=0.0,t_custom=0.0,t_prec=0.0,t1=0.0,t2=0.0; + + t_start =timer(); + + // Random number generator + curandGenerator_t randGen; + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + if(n < 1) { + WARNING("lobpcg_simplified - invalid parameter (n<1)"); + return -1; + } + if(k < 1) { + WARNING("lobpcg_simplified - invalid parameter (k<1)"); + return -1; + } + if(tol < 0) { + WARNING("lobpcg_simplified - invalid parameter (tol<0)"); + return -1; + } + if(k > n) { + WARNING("lobpcg_simplified - invalid parameters (k>n)"); + return -1; + } + + E = eigVals_dev; //array, not matrix, of eigenvalues + Y = &work_dev[0]; //alias Y = [X,R,P] + X = &work_dev[0]; //notice that X, R and P must be continuous in memory + R = &work_dev[k*n]; //R = A*X-B*X*E + P = &work_dev[2*k*n]; + Z = &work_dev[3*k*n]; //alias Z = A*Y = [AX,AR,AP] + AX= &work_dev[3*k*n]; //track A*X + AR= &work_dev[4*k*n]; //track A*R (also used as temporary storage) + AP= &work_dev[5*k*n]; //track A*P + Q = &work_dev[6*k*n]; //alias Q = B*Y = [BX,BR,BP] + BX= &work_dev[6*k*n]; //track B*X + BR= &work_dev[7*k*n]; //track B*R + BP= &work_dev[8*k*n]; //track B*P + G = &work_dev[9*k*n]; + H = &work_dev[9*k*n + k3*k3]; + HU = &work_dev[9*k*n + 2*k3*k3]; + HVT = &work_dev[9*k*n + 3*k3*k3]; + nrmR= &work_dev[9*k*n + 4*k3*k3]; + Workspace = &work_dev[9*k*n + 4*k3*k3+k]; + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + t1 =timer(); + + // create a CUDA stream + cudaEventCreate(&event); cudaCheckError(); + cudaStreamCreate(&s_alg); cudaCheckError(); + ///s_alg=NULL; + + // set pointer mode in CUBLAS + CHECK_CUBLAS(cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST)); + + // save and set streams in CUBLAS and CUSOLVER/MAGMA + CHECK_CUBLAS(cublasGetStream(cublasHandle, &s_cublas)); + CHECK_CUBLAS(cublasSetStream(cublasHandle, s_alg)); + //if (use_magma) { + // CHECK_CUBLAS(magmablasGetKernelStream(&s_magma)); //returns cublasStatus_t + // CHECK_CUBLAS(magmablasSetKernelStream(s_alg)); 
//returns cublasStatus_t + //} + //else { + CHECK_CUSOLVER(cusolverDnGetStream(cusolverHandle, &s_cusolver)); + CHECK_CUSOLVER(cusolverDnSetStream(cusolverHandle, s_alg)); + //} + // save and set streams in Laplacian/CUSPARSE + L->getCUDAStream(&s_cusparse); + L->setCUDAStream(s_alg); + + // Initialize random number generator + CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 123456/*time(NULL)*/)); + + // Initialize initial LOBPCG subspace + CHECK_CURAND(curandGenerateNormalX(randGen, X, k*n, zero, one)); + ///random_matrix(n,k,X,n,17,s_alg); + //print_matrix(3,3,X,n,"X"); + + // set nxk matrices P=0, AP=0 and BP=0 + cudaMemsetAsync(P, 0, n*k*sizeof(ValueType_), s_alg); cudaCheckError(); + cudaMemsetAsync(AP, 0, n*k*sizeof(ValueType_), s_alg);cudaCheckError(); + cudaMemsetAsync(BP, 0, n*k*sizeof(ValueType_), s_alg);cudaCheckError(); + + //if (use_magma) { + // //NB can be obtained through magma_get_dsytrd_nb(N). + // //If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). + // //If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. + // nb1 = magma_get_xsytrd_nb(k, zero); + // nb2 = magma_get_xsytrd_nb(k2,zero); + // nb3 = magma_get_xsytrd_nb(k3,zero); + // nb = max(nb1,max(nb2,nb3)); //this is needed to ensure allocations are correct even if sz is changed from k, 2*k to 3*k below + // lwork = max(2*k3+k3*nb, 1+6*k3+2*k3*k3); + // liwork = 3 + 5*k3; + // //printf("k=%d, nb=%d, lwork=%d, liwork=%d\n",k,nb,lwork,liwork); + // h_E = (ValueType_ *)malloc(k3*sizeof(h_E[0])); + // h_wa = (ValueType_ *)malloc(k3*k3*sizeof(h_wa[0])); + // h_work = (ValueType_ *)malloc(lwork*sizeof(h_work[0])); + // h_iwork= (IndexType_ *)malloc(liwork*sizeof(h_iwork[0])); + // if ((!h_E) || (!h_wa) || (!h_work) || (!h_iwork)) { + // WARNING("lobpcg_simplified - malloc failed"); + // return -1; + // } + //} + + if(use_throttle) { + cudaHostAlloc(&h_nrmR, 2*sizeof(h_nrmR[0]), cudaHostAllocDefault); //pinned memory + cudaCheckError(); + } + else{ + h_nrmR = (ValueType_ *)malloc((k+1)*sizeof(h_nrmR[0])); + } + + h_kappa_history = (ValueType_ *)malloc((mit+1)*sizeof(h_kappa_history[0])); + if ((!h_kappa_history) || (!h_nrmR) ) { + WARNING("lobpcg_simplified - malloc/cudaHostAlloc failed"); + return -1; + } + h_kappa_history[0] = -log10(eps)/2.0; + //printf("h_kappa_history[0] = %f\n",h_kappa_history[0]); + t2 =timer(); + t_setup+=t2-t1; + + // ------------------------------------------------------- + // Algorithm + // ------------------------------------------------------- + //BX= B*X + if (use_normalized_laplacian) { + L->dm(k, one, X, zero, BX); + } + else { + cudaMemcpyAsync(BX, X, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + } + //print_matrix(3,3,BX,n,"BX=B*X"); + + //G = X'*BX + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, k, k, n, &one, X, n, BX, n, &zero, G, k)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(k,k,G,k,"G=X'*BX"); + + //S = chol(G); + t1 =timer(); + //if (false /*use_magma*/) { + // MAGMACHECK(magma_xpotrf(k, G, k, &minfo)); + //} + //else{ + CHECK_CUSOLVER(cusolverXpotrf_bufferSize(cusolverHandle,k,G,k,&Lwork)); //Workspace was already over allocated earlier + CHECK_CUSOLVER(cusolverXpotrf(cusolverHandle,k,G,k,Workspace,Lwork,(int *)&Workspace[Lwork])); + //} + t2 =timer(); + t_potrf+=t2-t1; + //print_matrix(k,k,G,k,"S=chol(G,lower_part_stored)"); + + //X = X/S (notice that in MATLAB S has L', therefore extra transpose 
(CUBLAS_OP_T) is required below) + t1 =timer(); + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,n,k,&one,G,k, X,n)); + //BX=BX/S + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,n,k,&one,G,k,BX,n)); + t2 =timer(); + t_trsm+=t2-t1; + //print_matrix(3,3,X, n,"X = X/S"); + //print_matrix(3,3,BX,n,"BX=BX/S"); + + //AX = A*X + t1 =timer(); + L->mm(k, one, X, zero, AX); + t2 =timer(); + t_mm+=t2-t1; + //print_matrix(3,3,AX,n,"AX=A*X"); + + //H = X'*AX + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, k, k, n, &one, X, n, AX, n, &zero, H, k)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(k,k,H,k,"H=X'*A*X"); + + //[W,E]=eig(H) + t1 =timer(); + //if (use_magma) { + // MAGMACHECK(magma_xsyevd(k, H, k, h_E, h_wa, k, h_work, lwork, h_iwork, liwork, &minfo)); + // cudaMemcpy(E, h_E, k*sizeof(ValueType_), cudaMemcpyHostToDevice); cudaCheckError(); + //} + //else { + //WARNING: using eigVecs_dev as a temporary space + CHECK_CUSOLVER(cusolverXgesvd_bufferSize(cusolverHandle,k,k,H,k,HU,k,HVT,k,&Lwork)); //Workspace was already over allocated earlier + CHECK_CUSOLVER(cusolverXgesvd(cusolverHandle,k,k,H,k,eigVecs_dev,HU,k,HVT,k,Workspace,Lwork,NULL,(int *)&Workspace[Lwork])); + convert_to_ascending_order(k,H,k,E,HU,k,eigVecs_dev,s_alg); + //} + t2 =timer(); + t_syevd+=t2-t1; + //print_matrix(k,1,E,k,"E, from [W,E]=eig(H)"); + //print_matrix(k,k,H,k,"W, from [W,E]=eig(H)"); + + //X = X*W + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k, &one, X, n, H, k, &zero, AR, n)); + cudaMemcpyAsync(X, AR, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + //BX = BX*W + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k, &one,BX, n, H, k, &zero, AR, n)); + cudaMemcpyAsync(BX,AR, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + //AX = AX*W (notice that R=AX below, which we will use later on when computing residual R) + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k, &one, AX, n, H, k, &zero, R, n)); + cudaMemcpyAsync(AX, R, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + t2 =timer(); + t_gemm+=t2-t1; + //print_matrix(3,3,X, n,"X = X*W"); + //print_matrix(3,3,BX,n,"BX=BX*W"); + //print_matrix(3,3,AX,n,"AX=AX*W"); + + // start main loop + for(i=0; i(n,k,E,BX,n,R,n,s_alg); + t2 =timer(); + t_custom+=t2-t1; + //print_matrix(3,3,R,n,"R=AX-X*E"); + + //check convergence + t1 =timer(); + if (use_throttle) { //use throttle technique + if ((i % 2) == 0) { + //notice can not use G=R'*BR, because it is != R'*R, which is needed at this point + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, k, k, n, &one, R, n, R, n, &zero, G, k)); + collect_sqrt_memcpy(k,G,k,nrmR,s_alg); + cudaMemcpyAsync(h_nrmR, &nrmR[k-1], sizeof(ValueType_), cudaMemcpyDeviceToHost, s_alg); cudaCheckError(); + cudaEventRecord(event, s_alg); cudaCheckError(); + } + if (((i+1) % 2) == 0) { + cudaEventSynchronize(event); cudaCheckError(); + if (h_nrmR[0] < tol) { + break; + } + } + } + else { //use naive approach + for (j=0; jprec_solve(k,one,R,eigVecs_dev); + t2 =timer(); + t_prec+=t2-t1; + //print_matrix(3,3,R,n,"R=M\R"); + + //make residuals B orthogonal to X (I'm not sure this is needed) + //R = R - X*(BX'*R); + if (use_R_orthogonalization) { + t1 =timer(); + 
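+      // Two-step update R <- R - X*(BX'*R): the GEMM below forms the k x k
+      // projection G = BX'*R, and the following GEMM subtracts X*G from R.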
CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, k, k, n, &one, BX, n, R, n, &zero, G, k)); + t2 =timer(); + t_bdot+=t2-t1; + + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k, &mone, X, n, G, k, &one, R, n)); + t2 =timer(); + t_gemm+=t2-t1; + } + + //BX= B*X + if (use_normalized_laplacian) { + L->dm(k, one, R, zero, BR); + } + else { + cudaMemcpyAsync(BR, R, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + } + //G=R'*BR + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, k, k, n, &one, R, n, BR, n, &zero, G, k)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(k,k,G,k,"G=R'*BR"); + + //S = chol(G); + t1 =timer(); + //if (false /*use_magma*/) { + // MAGMACHECK(magma_xpotrf(k, G, k, &minfo)); + //} + //else{ + CHECK_CUSOLVER(cusolverXpotrf_bufferSize(cusolverHandle,k,G,k,&Lwork)); //Workspace was already over allocated earlier + CHECK_CUSOLVER(cusolverXpotrf(cusolverHandle,k,G,k,Workspace,Lwork,(int *)&Workspace[Lwork])); + // } + t2 =timer(); + t_potrf+=t2-t1; + //print_matrix(k,k,G,k,"S=chol(G,lower_part_stored)"); + + //R = R/S (notice that in MATLAB S has L', therefore extra transpose (CUBLAS_OP_T) is required below) + t1 =timer(); + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,n,k,&one,G,k,R,n)); + //BR=BR/S + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,n,k,&one,G,k,BR,n)); + t2 =timer(); + t_trsm+=t2-t1; + //print_matrix(3,3, R,n,"R = R/S"); + //print_matrix(3,3,BR,n,"BR=BR/S"); + + //G=Y'*Q (where Q=B*Y) + //std::cout<<"size : "<< sz<< std::endl; + //print_matrix(sz,sz,Y,sz,"Y"); + //print_matrix(sz,sz,Q,sz,"Q"); + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, sz, sz, n, &one, Y, n, Q, n, &zero, G, sz)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(sz,sz,G,sz,"G=Y'*Q"); + + //check conditioning of the subspace restart strategy + //WARNING: We need to compute condition number of matrix G in ||.||_2. + //Normally to compute these condition number we would perform a singular value + //decomposition and have kappa(G) = max_singular_value/min_singular_value of G. + t1 =timer(); + //if (use_magma) { + // //Notice also that MAGMA does not have GPU interface to singular_value decomposition, + // //but it does have one for the eigenvalue routine. 
We will take advantage of it: + // //Since G is symmetric we can also say that singular_value(G) = sqrt(eigenvalue(A'*A)) = eigenvalue(A), + // //therefore kappa(G) = max_eigenvalue_G/min_eigenvalue_G + // //[W,E]=eig(H) + // MAGMACHECK(magma_xsyevd_cond(sz, G, sz, h_E, h_wa, sz, h_work, lwork, h_iwork, liwork, &minfo)); + // kappa = log10(h_E[sz-1]/h_E[0])+1; + // //printf("cond=%f (%f/%f), %f\n",h_E[sz-1]/h_E[0],h_E[sz-1],h_E[0],log10(h_E[sz-1]/h_E[0])+1); + // //print_matrix(sz,1,h_E,sz,"h_E, sing_values(G)=eig(G) in cond(G)"); + //} + //else { + if (sz > n*k) { //WARNING: using eigVecs_dev as a temporary space (for sz singular values) + WARNING("lobpcg_simplified - temporary space insufficient (sz > n*k)"); + return -1; + } + CHECK_CUSOLVER(cusolverXgesvd_bufferSize(cusolverHandle,sz,sz,G,sz,HU,sz,HVT,sz,&Lwork)); //Workspace was already over allocated earlier + CHECK_CUSOLVER(cusolverXgesvd(cusolverHandle,sz,sz,G,sz,eigVecs_dev,HU,sz,HVT,sz,Workspace,Lwork,NULL,(int *)&Workspace[Lwork])); + compute_cond(sz,eigVecs_dev,s_alg); //condition number is eigVecs_dev[0] = eigVecs_dev[0]/eigVecs_dev[sz-1] + cudaMemcpy(&kappa, eigVecs_dev, sizeof(ValueType_), cudaMemcpyDeviceToHost); cudaCheckError();//FIX LATER using throttle technique + kappa = log10(kappa)+1.0; + ///kappa =1; + //} + t2 =timer(); + t_syevd+=t2-t1; + //printf("cond=%f\n", kappa); + //print_matrix(sz,sz,G,sz,"G, should not have changed cond(G)"); + + + //WARNING: will compute average (not mean, like MATLAB code) because it is easier to code + start = max(0,i-10-((int)round(log(static_cast(k))))); + kappa_average = zero; + for(j=start; j<=i; j++) { + //printf("%f ",h_kappa_history[j]); + kappa_average += h_kappa_history[j]; + } + //printf("\n"); + kappa_average = kappa_average/(i-start+1); + if (((kappa/kappa_average) > 2 && (kappa > 2)) || (kappa > max_kappa)) { + //exclude P from Y=[X,R] + sz = k2; + //printf("restart=%d (%d, %d, %d, %d) (%f %f %f)\n",i,(int)round(log(k)),i-10-((int)round(log(k))),start,i-start+1,kappa,kappa_average,max_kappa); + //recompute G=Y'*Q and corresponding condition number (excluding P) + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, sz, sz, n, &one, Y, n, Q, n, &zero, G, sz)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(sz,sz,G,sz,"G=Y'*Y"); + + t1 =timer(); + //if (use_magma) { + // MAGMACHECK(magma_xsyevd_cond(sz, G, sz, h_E, h_wa, sz, h_work, lwork, h_iwork, liwork, &minfo)); + // kappa = log10(h_E[sz-1]/h_E[0])+1; + //} + //else { + if (sz > n*k) { //WARNING: using eigVecs_dev as a temporary space (for sz singular values) + WARNING("lobpcg_simplified - temporary space insufficient (sz > n*k)"); + return -1; + } + CHECK_CUSOLVER(cusolverXgesvd_bufferSize(cusolverHandle,sz,sz,G,sz,HU,sz,HVT,sz,&Lwork)); //Workspace was already over allocated earlier + CHECK_CUSOLVER(cusolverXgesvd(cusolverHandle,sz,sz,G,sz,eigVecs_dev,HU,sz,HVT,sz,Workspace,Lwork,NULL,(int *)&Workspace[Lwork])); + compute_cond(sz,eigVecs_dev,s_alg); //condition number is eigVecs_dev[0] = eigVecs_dev[0]/eigVecs_dev[sz-1] + cudaMemcpy(&kappa, eigVecs_dev, sizeof(ValueType_), cudaMemcpyDeviceToHost); cudaCheckError(); //FIX LATER using throttle technique + kappa = log10(kappa)+1.0; + ///kappa =1; + //} + t2 =timer(); + t_syevd+=t2-t1; + //printf("cond=%f\n", kappa); + //print_matrix(sz,1,h_E,sz,"h_E, sing_values(G)=eig(G) in cond(G)"); + //print_matrix(sz,sz,G,sz,"G, should not have changed cond(G)"); + } + h_kappa_history[i+1] = kappa; + + //WARNING: the computation of condition number destroys 
the + //lower triangle of G (including diagonal), so it must be recomputed again. + //recompute G=Y'*Q + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, sz, sz, n, &one, Y, n, Q, n, &zero, G, sz)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(sz,sz,G,sz,"G=Y'*Q (recomputing)"); + + //AR = A*R + t1 =timer(); + L->mm(k, one, R, zero, AR); + t2 =timer(); + t_mm+=t2-t1; + //print_matrix(3,k,AR,n,"AR=A*R"); + + //H = Y'*Z + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, sz, sz, n, &one, Y, n, Z, n, &zero, H, sz)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(sz,sz,H,sz,"H=Y'*A*Y"); + + //Approach 1: + //S = chol(G); + t1 =timer(); + //if (false /*use_magma*/) { + // MAGMACHECK(magma_xpotrf(sz, G, sz, &minfo)); + //} + //else{ + CHECK_CUSOLVER(cusolverXpotrf_bufferSize(cusolverHandle,sz,G,sz,&Lwork)); //Workspace was over already over allocated earlier + CHECK_CUSOLVER(cusolverXpotrf(cusolverHandle,sz,G,sz,Workspace,Lwork,(int *)&Workspace[Lwork])); + //} + t2 =timer(); + t_potrf+=t2-t1; + //print_matrix(sz,sz,G,sz,"S=chol(G,lower_part_stored)"); + + //H = S'\ H /S (notice that in MATLAB S has L', therefore extra transpose (CUBLAS_OP_T) is required below) + t1 =timer(); + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,sz,sz,&one,G,sz,H,sz)); + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,sz,sz,&one,G,sz,H,sz)); + t2 =timer(); + t_trsm+=t2-t1; + //print_matrix(sz,sz,H,sz,"H = S'\\ H /S"); + + //[W,E]=eig(S'\ H /S); + t1 =timer(); + //if (use_magma) { + // MAGMACHECK(magma_xsyevd(sz, H, sz, h_E, h_wa, sz, h_work, lwork, h_iwork, liwork, &minfo)); + // cudaMemcpy(E, h_E, k*sizeof(ValueType_), cudaMemcpyHostToDevice); cudaCheckError(); //only have k spaces in E, but h_E have sz eigs + //} + //else { + if (sz > n*k) { //WARNING: using eigVecs_dev as a temporary space (for sz singular values) + WARNING("lobpcg_simplified - temporary space insufficient (sz > n*k)"); + return -1; + } + CHECK_CUSOLVER(cusolverXgesvd_bufferSize(cusolverHandle,sz,sz,H,sz,HU,sz,HVT,sz,&Lwork)); //Workspace was already over allocated earlier + CHECK_CUSOLVER(cusolverXgesvd(cusolverHandle,sz,sz,H,sz,eigVecs_dev,HU,sz,HVT,sz,Workspace,Lwork,NULL,(int *)&Workspace[Lwork])); + convert_to_ascending_order(sz,H,sz,E,HU,sz,eigVecs_dev,s_alg); + //} + t2 =timer(); + t_syevd+=t2-t1; + //print_matrix(sz,1,h_E,sz,"h_E, from [W,E]=eig(S'\\ H /S)"); + //print_matrix(k,1,E,k,"E, smallest k eigs from [W,E]=eig(S'\\ H /S)"); + //print_matrix(sz,sz,H,sz,"W, from [W,E]=eig(S'\\ H /S)"); + + //W=S\W (recover original eigvectors) + t1 =timer(); + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,sz,sz,&one,G,sz,H,sz)); + t2 =timer(); + t_trsm+=t2-t1; + //print_matrix(sz,sz,H,sz,"W=S\\W"); + + //WARNING: using eigVecs_dev as a temporary space + //X =Y*W(:,1:k); //notice can not use X for the result directly, because it is part of Y (and aliased by Y) + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, sz, &one, Y, n, H, sz, &zero, eigVecs_dev, n)); + cudaMemcpyAsync(X, eigVecs_dev, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + //BX=Q*W(:,1:k); //notice can not use BX for the result directly, because it is part of Q (and aliased by Q) + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, sz, &one, Q, n, H, 
sz, &zero, eigVecs_dev, n)); + cudaMemcpyAsync(BX, eigVecs_dev, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + //AX=Z*W(:,1:k); //notice can not use AX for the result directly, because it is part of Z (and aliased by Z) + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, sz, &one, Z, n, H, sz, &zero, eigVecs_dev, n)); + cudaMemcpyAsync(AX, eigVecs_dev, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + t2 =timer(); + t_gemm+=t2-t1; + //print_matrix(3,3, X,n,"X =Y*W(:,1:k)"); + //print_matrix(3,3,BX,n,"BX=Q*W(:,1:k)"); + //print_matrix(3,3,AX,n,"AX=Z*W(:,1:k)"); + + //update P + t1 =timer(); + if (sz == k2) { + //P = R*W(k+1:2*k,1:k); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k, &one, R, n, &H[k], sz, &zero, P, n)); + //BP=BR*W(k+1:2*k,1:k); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k, &one,BR, n, &H[k], sz, &zero,BP, n)); + //AP=AR*W(k+1:2*k,1:k); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k, &one,AR, n, &H[k], sz, &zero,AP, n)); + //print_matrix(3,3, P,n,"P = R*W(k+1:2*k,1:k)"); + //print_matrix(3,3,BP,n,"BP=BR*W(k+1:2*k,1:k)"); + //print_matrix(3,3,AP,n,"AP=AR*W(k+1:2*k,1:k)"); + } + else { //(sz == k3) + //P= R*W(k+1:2*k,1:k) + P*W(2*k+1:3*k,1:k); and recall that Y = [X,R,P] + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k2, &one, &Y[n*k], n, &H[k], sz, &zero, eigVecs_dev, n)); + cudaMemcpyAsync(P, eigVecs_dev, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg);cudaCheckError(); + //BP=BR*W(k+1:2*k,1:k) + BP*W(2*k+1:3*k,1:k); and recall that Q = [BX,BR,BP] + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k2, &one, &Q[n*k], n, &H[k], sz, &zero, eigVecs_dev, n)); + cudaMemcpyAsync(BP, eigVecs_dev, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg);cudaCheckError(); + //AP=AR*W(k+1:2*k,1:k) + AP*W(2*k+1:3*k,1:k); and recall that Z = [AX,AR,AP] + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N, n, k, k2, &one, &Z[n*k], n, &H[k], sz, &zero, eigVecs_dev, n)); + cudaMemcpyAsync(AP, eigVecs_dev, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg);cudaCheckError(); + //print_matrix(3,3, P,n,"P = R*W(k+1:2*k,1:k) + P*W(2*k+1:3*k,1:k)"); + //print_matrix(3,3,BP,n,"BP=BR*W(k+1:2*k,1:k) + BP*W(2*k+1:3*k,1:k)"); + //print_matrix(3,3,AP,n,"AP=AR*W(k+1:2*k,1:k) + AP*W(2*k+1:3*k,1:k)"); + } + t2 =timer(); + t_gemm+=t2-t1; + + //orthonormalize P + //G = P'*BP + t1 =timer(); + CHECK_CUBLAS(cublasXgemm(cublasHandle,CUBLAS_OP_T,CUBLAS_OP_N, k, k, n, &one, P, n, BP, n, &zero, G, k)); + t2 =timer(); + t_bdot+=t2-t1; + //print_matrix(k,k,G,k,"G=P'*BP"); + + //S = chol(G); + t1 =timer(); + //if (false /*use_magma*/) { + // MAGMACHECK(magma_xpotrf(k, G, k, &minfo)); + //} + //else{ + CHECK_CUSOLVER(cusolverXpotrf_bufferSize(cusolverHandle,k,G,k,&Lwork)); //Workspace was already over allocated earlier + CHECK_CUSOLVER(cusolverXpotrf(cusolverHandle,k,G,k,Workspace,Lwork,(int *)&Workspace[Lwork])); + //} + t2 =timer(); + t_potrf+=t2-t1; + //print_matrix(k,k,G,k,"S=chol(G,lower_part_stored)"); + + //P = P/S (notice that in MATLAB S has L', therefore extra transpose (CUBLAS_OP_T) is required below) + t1 =timer(); + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,n,k,&one,G,k,P,n)); + //BP = BP/S + 
CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,n,k,&one,G,k,BP,n)); + //AP = AP/S + CHECK_CUBLAS(cublasXtrsm(cublasHandle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,CUBLAS_OP_T,CUBLAS_DIAG_NON_UNIT,n,k,&one,G,k,AP,n)); + t2 =timer(); + t_trsm+=t2-t1; + //print_matrix(3,3, P,n,"P = P/S"); + //print_matrix(3,3,BP,n,"BP=BP/S"); + //print_matrix(3,3,AP,n,"AP=AP/S"); + + //copy AX into R (to satisfy assumption in the next iteration) + cudaMemcpyAsync(R, AX, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg);cudaCheckError(); + //reset sz for the next iteration + sz=k3; + //printf("--- %d ---\n",i); + } + t_end =timer(); + t_total+=t_end-t_start; + + //WARNING: In the MATLAB code at this point X is made a section of A, + //which I don't think is necessary, but something to keep in mind, + //in case something goes wrong in the future. + cudaMemcpyAsync(eigVecs_dev, X, n*k*sizeof(ValueType_), cudaMemcpyDeviceToDevice, s_alg); cudaCheckError(); + + //free temporary host memory + cudaStreamSynchronize(s_alg); cudaCheckError(); + //if (use_magma) { + // if (h_E) free(h_E); + // if (h_wa) free(h_wa); + // if (h_work) free(h_work); + // if (h_iwork) free(h_iwork); + //} + if(use_throttle) { + cudaFreeHost(h_nrmR);cudaCheckError(); //pinned + } + else { + if (h_nrmR) free(h_nrmR); + } + if (h_kappa_history) free(h_kappa_history); + cudaEventDestroy(event);cudaCheckError(); + if (s_alg) {cudaStreamDestroy(s_alg);cudaCheckError();} + //revert CUBLAS and CUSOLVER/MAGMA streams + CHECK_CUBLAS(cublasSetStream(cublasHandle, s_cublas)); + //if (use_magma) { + // CHECK_CUBLAS(magmablasSetKernelStream(s_magma)); //returns cublasStatus_t + //} + //else { + CHECK_CUSOLVER(cusolverDnSetStream(cusolverHandle, s_cusolver)); + //} + //revert Laplacian/CUSPARSE streams + L->setCUDAStream(s_cusparse); + +#ifdef COLLECT_TIME_STATISTICS + //timing statistics + printf("-------------------------\n"); + printf("time eigsolver [total] %f\n",t_total); + printf("time eigsolver [L->pr] %f\n",t_prec); + printf("time eigsolver [potrf] %f\n",t_potrf); + printf("time eigsolver [syevd] %f\n",t_syevd); + printf("time eigsolver [trsm] %f\n",t_trsm); + printf("time eigsolver [bdot] %f\n",t_bdot); + printf("time eigsolver [gemm] %f\n",t_gemm); + printf("time eigsolver [L->mm] %f\n",t_mm); + printf("time eigsolver [custom]%f\n",t_custom); + printf("time eigsolver [setup] %f\n",t_setup); + printf("time eigsolver [other] %f\n",t_total-(t_prec+t_potrf+t_syevd+t_trsm+t_bdot+t_gemm+t_mm+t_custom+t_setup)); +#endif + return 0; + } + + // ========================================================= + // Explicit instantiation + // ========================================================= + + template int lobpcg_simplified + (cublasHandle_t cublasHandle, cusolverDnHandle_t cusolverHandle, + int n, int k, + /*const*/ Matrix * A, + float * __restrict__ eigVecs_dev, + float * __restrict__ eigVals_dev, + int maxIter, float tol, + float * __restrict__ work_dev, + int &iter); + + template int lobpcg_simplified + (cublasHandle_t cublasHandle, cusolverDnHandle_t cusolverHandle, + int n, int k, + /*const*/ Matrix * A, + double * __restrict__ eigVecs_dev, + double * __restrict__ eigVals_dev, + int maxIter, double tol, + double * __restrict__ work_dev, + int &iter); + +} +//#endif //enable/disable lobpcg + diff --git a/cpp/nvgraph/cpp/src/matrix.cu b/cpp/nvgraph/cpp/src/matrix.cu new file mode 100644 index 00000000000..4d7c73be9b3 --- /dev/null +++ b/cpp/nvgraph/cpp/src/matrix.cu @@ -0,0 
+1,663 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +//#ifdef NVGRAPH_PARTITION +//#ifdef DEBUG + +#include "matrix.hxx" + +#include +#include + +#include "nvgraph_error.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_cublas.hxx" +#include "nvgraph_cusparse.hxx" +#include "debug_macros.h" + +// ========================================================= +// Useful macros +// ========================================================= + +// CUDA block size +#define BLOCK_SIZE 1024 + +// Get index of matrix entry +#define IDX(i,j,lda) ((i)+(j)*(lda)) + +namespace nvgraph { + + // ============================================= + // CUDA kernels + // ============================================= + + namespace { + + /// Apply diagonal matrix to vector + template static __global__ + void diagmv(IndexType_ n, ValueType_ alpha, + const ValueType_ * __restrict__ D, + const ValueType_ * __restrict__ x, + ValueType_ * __restrict__ y) { + IndexType_ i = threadIdx.x + blockIdx.x*blockDim.x; + while(i + static __global__ void diagmm(IndexType_ n, IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ D, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) { + IndexType_ i,j,index; + + for(j=threadIdx.y+blockIdx.y*blockDim.y; j + DenseMatrix + ::DenseMatrix(bool _trans, + IndexType_ _m, IndexType_ _n, + const ValueType_ * _A, IndexType_ _lda) + : Matrix(_m,_n), + trans(_trans), A(_A), lda(_lda) { + Cublas::set_pointer_mode_host(); + if(_lda<_m) + FatalError("invalid dense matrix parameter (lda + DenseMatrix::~DenseMatrix() {} + + /// Get and Set CUDA stream + template + void DenseMatrix + ::setCUDAStream(cudaStream_t _s) { + this->s = _s; + //printf("DenseMatrix setCUDAStream stream=%p\n",this->s); + Cublas::setStream(_s); + } + template + void DenseMatrix + ::getCUDAStream(cudaStream_t *_s) { + *_s = this->s; + //CHECK_CUBLAS(cublasGetStream(cublasHandle, _s)); + } + + + /// Matrix-vector product for dense matrix class + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. 
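+ * Note: implemented as a single cuBLAS gemv call on the wrapped column-major buffer (leading dimension lda).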
+ */ + template + void DenseMatrix + ::mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const { + Cublas::gemv(this->trans, this->m, this->n, + &alpha, this->A, this->lda, x, 1, &beta, y, 1); + } + + template + void DenseMatrix + ::mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const { + Cublas::gemm(this->trans, false, this->m, k, this->n, + &alpha, A, lda, x, this->m, &beta, y, this->n); + } + + /// Color and Reorder + template + void DenseMatrix + ::color(IndexType_ *c, IndexType_ *p) const { + + } + + template + void DenseMatrix + ::reorder(IndexType_ *p) const { + + } + + /// Incomplete Cholesky (setup, factor and solve) + template + void DenseMatrix + ::prec_setup(Matrix * _M) { + printf("ERROR: DenseMatrix prec_setup dispacthed\n"); + //exit(1); + } + + template + void DenseMatrix + ::prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const { + printf("ERROR: DenseMatrix prec_solve dispacthed\n"); + //exit(1); + } + + template + ValueType_ DenseMatrix + ::getEdgeSum() const { + return 0.0; + } + + // ============================================= + // CSR matrix class + // ============================================= + + /// Constructor for CSR matrix class + /** @param _transA Whether to transpose matrix. + * @param _m Number of rows. + * @param _n Number of columns. + * @param _nnz Number of non-zero entries. + * @param _descrA Matrix properties. + * @param _csrValA (Input, device memory, _nnz entries) Matrix + * entry values. + * @param _csrRowPtrA (Input, device memory, _m+1 entries) Pointer + * to first entry in each row. + * @param _csrColIndA (Input, device memory, _nnz entries) Column + * index of each matrix entry. 
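+ * Note: the CSR arrays are wrapped by pointer, not copied; the device buffers remain owned by the caller.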
+ */ + template + CsrMatrix + ::CsrMatrix(bool _trans, bool _sym, + IndexType_ _m, IndexType_ _n, IndexType_ _nnz, + const cusparseMatDescr_t _descrA, + /*const*/ ValueType_ * _csrValA, + const IndexType_ * _csrRowPtrA, + const IndexType_ * _csrColIndA) + : Matrix(_m,_n), + trans(_trans), sym(_sym), + nnz(_nnz), descrA(_descrA), csrValA(_csrValA), + csrRowPtrA(_csrRowPtrA), + csrColIndA(_csrColIndA) { + if(nnz<0) + FatalError("invalid CSR matrix parameter (nnz<0)", + NVGRAPH_ERR_BAD_PARAMETERS); + Cusparse::set_pointer_mode_host(); + } + + /// Constructor for CSR matrix class + /** @param G Weighted graph in CSR format + */ + template + CsrMatrix + ::CsrMatrix( ValuedCsrGraph & G, const cusparseMatDescr_t _descrA) + : Matrix(G.get_num_vertices(), G.get_num_vertices()), + trans(false), sym(false), + nnz(G.get_num_edges()), + descrA(_descrA), + csrValA(G.get_raw_values()), + csrRowPtrA(G.get_raw_row_offsets()), + csrColIndA(G.get_raw_column_indices()) { + Cusparse::set_pointer_mode_host(); + } + + /// Destructor for CSR matrix class + template + CsrMatrix::~CsrMatrix() {} + + /// Get and Set CUDA stream + template + void CsrMatrix + ::setCUDAStream(cudaStream_t _s) { + this->s = _s; + //printf("CsrMatrix setCUDAStream stream=%p\n",this->s); + Cusparse::setStream(_s); + } + template + void CsrMatrix + ::getCUDAStream(cudaStream_t *_s) { + *_s = this->s; + //CHECK_CUSPARSE(cusparseGetStream(Cusparse::get_handle(), _s)); + } + template + void CsrMatrix + ::mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const { + //CHECK_CUSPARSE(cusparseXcsrmm(Cusparse::get_handle(), transA, this->m, k, this->n, nnz, &alpha, descrA, csrValA, csrRowPtrA, csrColIndA, x, this->n, &beta, y, this->m)); + Cusparse::csrmm(this->trans, this->sym, this->m, k, this->n, this->nnz, &alpha, this->csrValA, this->csrRowPtrA, this->csrColIndA, x, this->n, &beta, y, this->m); + } + + /// Color and Reorder + template + void CsrMatrix + ::color(IndexType_ *c, IndexType_ *p) const { + + } + + template + void CsrMatrix + ::reorder(IndexType_ *p) const { + + } + + /// Incomplete Cholesky (setup, factor and solve) + template + void CsrMatrix + ::prec_setup(Matrix * _M) { + //printf("CsrMatrix prec_setup dispacthed\n"); + if (!factored) { + //analyse lower triangular factor + CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_l)); + CHECK_CUSPARSE(cusparseSetMatFillMode(descrA,CUSPARSE_FILL_MODE_LOWER)); + CHECK_CUSPARSE(cusparseSetMatDiagType(descrA,CUSPARSE_DIAG_TYPE_UNIT)); + CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(),CUSPARSE_OPERATION_NON_TRANSPOSE,this->m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,info_l)); + //analyse upper triangular factor + CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_u)); + CHECK_CUSPARSE(cusparseSetMatFillMode(descrA,CUSPARSE_FILL_MODE_UPPER)); + CHECK_CUSPARSE(cusparseSetMatDiagType(descrA,CUSPARSE_DIAG_TYPE_NON_UNIT)); + CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(),CUSPARSE_OPERATION_NON_TRANSPOSE,this->m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,info_u)); + //perform csrilu0 (should be slightly faster than csric0) + CHECK_CUSPARSE(cusparseXcsrilu0(Cusparse::get_handle(),CUSPARSE_OPERATION_NON_TRANSPOSE,this->m,descrA,csrValA,csrRowPtrA,csrColIndA,info_l)); + //set factored flag to true + factored=true; + } + } + + template + void CsrMatrix + ::prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const { + //printf("CsrMatrix prec_solve 
dispacthed (stream %p)\n",this->s); + + //preconditioning Mx=f (where M = L*U, threfore x=U\(L\f)) + //solve lower triangular factor + CHECK_CUSPARSE(cusparseSetMatFillMode(descrA,CUSPARSE_FILL_MODE_LOWER)); + CHECK_CUSPARSE(cusparseSetMatDiagType(descrA,CUSPARSE_DIAG_TYPE_UNIT)); + CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(),CUSPARSE_OPERATION_NON_TRANSPOSE,this->m,k,alpha,descrA,csrValA,csrRowPtrA,csrColIndA,info_l,fx,this->m,t,this->m)); + //solve upper triangular factor + CHECK_CUSPARSE(cusparseSetMatFillMode(descrA,CUSPARSE_FILL_MODE_UPPER)); + CHECK_CUSPARSE(cusparseSetMatDiagType(descrA,CUSPARSE_DIAG_TYPE_NON_UNIT)); + CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(),CUSPARSE_OPERATION_NON_TRANSPOSE,this->m,k,alpha,descrA,csrValA,csrRowPtrA,csrColIndA,info_u,t,this->m,fx,this->m)); + + } + + /// Matrix-vector product for CSR matrix class + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. + */ + template + void CsrMatrix + ::mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const { + // TODO: consider using merge-path csrmv + Cusparse::csrmv(this->trans, this->sym, this->m, this->n, + this->nnz, &alpha, this->csrValA, + this->csrRowPtrA, this->csrColIndA, + x, &beta, y); + + } + + template + ValueType_ CsrMatrix + ::getEdgeSum() const { + return 0.0; + } + + // ============================================= + // Laplacian matrix class + // ============================================= + + /// Constructor for Laplacian matrix class + /** @param A Adjacency matrix + */ + template + LaplacianMatrix + ::LaplacianMatrix(/*const*/ Matrix & _A) + : Matrix(_A.m,_A.n), A(&_A) { + + // Check that adjacency matrix is square + if(_A.m != _A.n) + FatalError("cannot construct Laplacian matrix from non-square adjacency matrix", + NVGRAPH_ERR_BAD_PARAMETERS); + //set CUDA stream + this->s = NULL; + // Construct degree matrix + D.allocate(_A.m,this->s); + Vector ones(this->n,this->s); + ones.fill(1.0); + _A.mv(1, ones.raw(), 0, D.raw()); + + // Set preconditioning matrix pointer to NULL + M=NULL; + } + + /// Destructor for Laplacian matrix class + template + LaplacianMatrix::~LaplacianMatrix() {} + + /// Get and Set CUDA stream + template + void LaplacianMatrix::setCUDAStream(cudaStream_t _s) { + this->s = _s; + //printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s); + A->setCUDAStream(_s); + if (M != NULL) { + M->setCUDAStream(_s); + } + } + template + void LaplacianMatrix::getCUDAStream(cudaStream_t * _s) { + *_s = this->s; + //A->getCUDAStream(_s); + } + + /// Matrix-vector product for Laplacian matrix class + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. 
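+ * Note: the operator applied here is the graph Laplacian (degree matrix minus adjacency matrix); the diagonal degree term, built in the constructor from the adjacency row sums, is applied by the diagmv kernel, and the adjacency term is applied through the wrapped matrix's mv() with weight -alpha.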
+ */ + template + void LaplacianMatrix + ::mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const { + + // Scale result vector + if(beta==0) + CHECK_CUDA(cudaMemset(y, 0, (this->n)*sizeof(ValueType_))) + else if(beta!=1) + thrust::transform(thrust::device_pointer_cast(y), + thrust::device_pointer_cast(y+this->n), + thrust::make_constant_iterator(beta), + thrust::device_pointer_cast(y), + thrust::multiplies()); + + // Apply diagonal matrix + dim3 gridDim, blockDim; + gridDim.x = min(((this->n)+BLOCK_SIZE-1)/BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + diagmv <<< gridDim, blockDim , 0, A->s>>> (this->n, alpha, D.raw(), x, y); + cudaCheckError(); + + // Apply adjacency matrix + A->mv(-alpha, x, 1, y); + + } + /// Matrix-vector product for Laplacian matrix class + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n*k entries) nxk dense matrix. + * @param beta Scalar. + * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. + */ + template + void LaplacianMatrix + ::mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const { + // Apply diagonal matrix + ValueType_ one = (ValueType_)1.0; + this->dm(k,alpha,x,beta,y); + + // Apply adjacency matrix + A->mm(k, -alpha, x, one, y); + } + + template + void LaplacianMatrix + ::dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const { + IndexType_ t = k*(this->n); + dim3 gridDim, blockDim; + + //setup launch parameters + gridDim.x = min(((this->n)+BLOCK_SIZE-1)/BLOCK_SIZE, 65535); + gridDim.y = min(k,65535); + gridDim.z = 1; + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + + // Apply diagonal matrix + if(beta == 0.0) { + //set vectors to 0 (WARNING: notice that you need to set, not scale, because of NaNs corner case) + CHECK_CUDA(cudaMemset(y, 0, t*sizeof(ValueType_))); + diagmm <<< gridDim, blockDim, 0, A->s >>> (this->n, k, alpha, D.raw(), x, beta, y); + } + else { + diagmm<<< gridDim, blockDim, 0, A->s >>> (this->n, k, alpha, D.raw(), x, beta, y); + } + cudaCheckError(); + } + + + /// Color and Reorder + template + void LaplacianMatrix + ::color(IndexType_ *c, IndexType_ *p) const { + + } + + template + void LaplacianMatrix + ::reorder(IndexType_ *p) const { + + } + + /// Solve preconditioned system M x = f for a set of k vectors + template + void LaplacianMatrix + ::prec_setup(Matrix * _M) { + //save the pointer to preconditioner M + M = _M; + if (M != NULL) { + //setup the preconditioning matrix M + M->prec_setup(NULL); + } + } + + template + void LaplacianMatrix + ::prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const { + if (M != NULL) { + //preconditioning + M->prec_solve(k,alpha,fx,t); + } + } + + template + ValueType_ LaplacianMatrix + ::getEdgeSum() const { + return 0.0; + } +// ============================================= + // Modularity matrix class + // ============================================= + + /// Constructor for Modularity matrix class + /** @param A Adjacency matrix + */ + template + ModularityMatrix + ::ModularityMatrix(/*const*/ Matrix & _A, IndexType_ _nnz) + : Matrix(_A.m,_A.n), A(&_A), nnz(_nnz){ + + // Check that adjacency matrix is square + if(_A.m != _A.n) + FatalError("cannot construct Modularity matrix from 
non-square adjacency matrix", + NVGRAPH_ERR_BAD_PARAMETERS); + + //set CUDA stream + this->s = NULL; + // Construct degree matrix + D.allocate(_A.m,this->s); + Vector ones(this->n,this->s); + ones.fill(1.0); + _A.mv(1, ones.raw(), 0, D.raw()); + // D.dump(0,this->n); + edge_sum = D.nrm1(); + + // Set preconditioning matrix pointer to NULL + M=NULL; + } + + /// Destructor for Modularity matrix class + template + ModularityMatrix::~ModularityMatrix() {} + + /// Get and Set CUDA stream + template + void ModularityMatrix::setCUDAStream(cudaStream_t _s) { + this->s = _s; + //printf("ModularityMatrix setCUDAStream stream=%p\n",this->s); + A->setCUDAStream(_s); + if (M != NULL) { + M->setCUDAStream(_s); + } + } + + template + void ModularityMatrix::getCUDAStream(cudaStream_t * _s) { + *_s = this->s; + //A->getCUDAStream(_s); + } + + /// Matrix-vector product for Modularity matrix class + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. + */ + template + void ModularityMatrix + ::mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const { + + // Scale result vector + if(alpha!=1 || beta!=0) + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + + //CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy, double *result)); + // y = A*x + A->mv(alpha, x, 0, y); + ValueType_ dot_res; + //gamma = d'*x + Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); + // y = y -(gamma/edge_sum)*d + Cublas::axpy(this->n, -(dot_res/this->edge_sum), D.raw(), 1, y, 1); + } + /// Matrix-vector product for Modularity matrix class + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n*k entries) nxk dense matrix. + * @param beta Scalar. + * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. 
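+ * Note: unlike mv(), the blocked mm() path is not implemented for the modularity matrix and fails with NVGRAPH_ERR_NOT_IMPLEMENTED.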
+ */ + template + void ModularityMatrix + ::mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const { + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + } + + template + void ModularityMatrix + ::dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const { + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + + } + + /// Color and Reorder + template + void ModularityMatrix + ::color(IndexType_ *c, IndexType_ *p) const { + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + + } + + template + void ModularityMatrix + ::reorder(IndexType_ *p) const { + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + } + + /// Solve preconditioned system M x = f for a set of k vectors + template + void ModularityMatrix + ::prec_setup(Matrix * _M) { + //save the pointer to preconditioner M + M = _M; + if (M != NULL) { + //setup the preconditioning matrix M + M->prec_setup(NULL); + } + } + + template + void ModularityMatrix + ::prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const { + if (M != NULL) { + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); + } + } + + template + ValueType_ ModularityMatrix + ::getEdgeSum() const { + return edge_sum; + } + // Explicit instantiation + template class Matrix; + template class Matrix; + template class DenseMatrix; + template class DenseMatrix; + template class CsrMatrix; + template class CsrMatrix; + template class LaplacianMatrix; + template class LaplacianMatrix; + template class ModularityMatrix; + template class ModularityMatrix; + +} +//#endif diff --git a/cpp/nvgraph/cpp/src/modularity_maximization.cu b/cpp/nvgraph/cpp/src/modularity_maximization.cu new file mode 100644 index 00000000000..b322df72baf --- /dev/null +++ b/cpp/nvgraph/cpp/src/modularity_maximization.cu @@ -0,0 +1,594 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +//#ifdef NVGRAPH_PARTITION + +#include "modularity_maximization.hxx" + +#include +#include + +#include +#include +#include +#include +#include + +#include "nvgraph_error.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_cublas.hxx" +#include "matrix.hxx" +#include "lanczos.hxx" +#include "kmeans.hxx" +#include "debug_macros.h" +#include "lobpcg.hxx" +#include "sm_utils.h" + +//#define COLLECT_TIME_STATISTICS 1 +//#undef COLLECT_TIME_STATISTICS + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#include "cuda_profiler_api.h" +#endif + +#ifdef COLLECT_TIME_STATISTICS +static double timer (void) { + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} +#endif + +namespace nvgraph { + + // ========================================================= + // Useful macros + // ========================================================= + + // Get index of matrix entry +#define IDX(i,j,lda) ((i)+(j)*(lda)) + +// namespace { +// /// Get string associated with NVGRAPH error flag +// static +// const char* nvgraphGetErrorString(NVGRAPH_ERROR e) { +// switch(e) { +// case NVGRAPH_OK: return "NVGRAPH_OK"; +// case NVGRAPH_ERR_BAD_PARAMETERS: return "NVGRAPH_ERR_BAD_PARAMETERS"; +// case NVGRAPH_ERR_UNKNOWN: return "NVGRAPH_ERR_UNKNOWN"; +// case NVGRAPH_ERR_CUDA_FAILURE: return "NVGRAPH_ERR_CUDA_FAILURE"; +// case NVGRAPH_ERR_THRUST_FAILURE: return "NVGRAPH_ERR_THRUST_FAILURE"; +// case NVGRAPH_ERR_IO: return "NVGRAPH_ERR_IO"; +// case NVGRAPH_ERR_NOT_IMPLEMENTED: return "NVGRAPH_ERR_NOT_IMPLEMENTED"; +// case NVGRAPH_ERR_NO_MEMORY: return "NVGRAPH_ERR_NO_MEMORY"; +// default: return "unknown NVGRAPH error"; +// } +// } +// } + + template + static int print_matrix(IndexType_ m, IndexType_ n, ValueType_ * A, IndexType_ lda, const char *s){ + IndexType_ i,j; + ValueType_ * h_A; + + if (m > lda) { + WARNING("print_matrix - invalid parameter (m > lda)"); + return -1; + } + if (Device_) { + h_A = (ValueType_ *)malloc(lda*n*sizeof(ValueType_)); + if (!h_A) { + WARNING("print_matrix - malloc failed"); + return -1; + } + cudaMemcpy(h_A, A, lda*n*sizeof(ValueType_), cudaMemcpyDeviceToHost); cudaCheckError() + } + else { + h_A = A; + } + + printf("%s\n",s); + if(print_transpose){ + for (j=0; j + static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) { + IndexType_ i,j,k,index,mm; + ValueType_ alpha,v,last; + bool valid; + //ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + //compute alpha + mm =(((m+blockDim.x-1)/blockDim.x)*blockDim.x); //m in multiple of blockDim.x + alpha=0.0; + //printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, li, mn); + for (j=threadIdx.y+blockIdx.y*blockDim.y; j= k) alpha+=v; + } + //shift by last + alpha+=last; + } + } + + //scale by alpha + alpha = utils::shfl(alpha, blockDim.x-1, blockDim.x); + alpha = std::sqrt(alpha); + for (j=threadIdx.y+blockIdx.y*blockDim.y; j + IndexType_ next_pow2(IndexType_ n) { + IndexType_ v; + //Reference: + //http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n-1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v+1; + } + + template + cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { + IndexType_ p2m; + dim3 nthreads, nblocks; + + //find next power of 2 + p2m = next_pow2(m); + //setup launch configuration + nthreads.x = max(2,min(p2m,32)); + nthreads.y 
= 256/nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1)/nthreads.y; + nblocks.z = 1; + //printf("m=%d(%d),n=%d,obs=%p, nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + //launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m,n,obs); + cudaCheckError(); + + return cudaSuccess; + } + + // ========================================================= + // Spectral modularity_maximization + // ========================================================= + + /** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR modularity_maximization( ValuedCsrGraph& G, + IndexType_ nClusters, + IndexType_ nEigVecs, + IndexType_ maxIter_lanczos, + IndexType_ restartIter_lanczos, + ValueType_ tol_lanczos, + IndexType_ maxIter_kmeans, + ValueType_ tol_kmeans, + IndexType_ * __restrict__ clusters, + Vector &eigVals, + Vector &eigVecs, + IndexType_ & iters_lanczos, + IndexType_ & iters_kmeans) { + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + + if(nClusters < 1) { + WARNING("invalid parameter (nClusters<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxIter_lanczos < nEigVecs) { + WARNING("invalid parameter (maxIter_lanczos * A; // Adjacency matrix + Matrix * B; // Modularity matrix + + // Whether to perform full reorthogonalization in Lanczos + bool reorthogonalize_lanczos = false; + + // k-means residual + ValueType_ residual_kmeans; + + bool scale_eigevec_rows=true; //true; //false; +#ifdef COLLECT_TIME_STATISTICS + double t1=0.0,t2=0.0; +#endif + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Modularity Matrix + #ifdef COLLECT_TIME_STATISTICS + t1=timer(); + #endif + // Initialize Modularity Matrix + A = new CsrMatrix(G); + B = new ModularityMatrix(*A, static_cast(G.get_num_edges())); + + // Compute smallest eigenvalues and eigenvectors +#ifdef COLLECT_TIME_STATISTICS + t2=timer(); + printf("%f\n",t2-t1); +#endif + +#ifdef COLLECT_TIME_STATISTICS + t1=timer(); + cudaProfilerStart(); +#endif + + CHECK_NVGRAPH(computeLargestEigenvectors(*B, nEigVecs, maxIter_lanczos, + restartIter_lanczos, tol_lanczos, + reorthogonalize_lanczos, iters_lanczos, + eigVals.raw(), eigVecs.raw())); + + #ifdef COLLECT_TIME_STATISTICS + 
cudaProfilerStop(); + t2=timer(); + printf("%f\n",t2-t1); +#endif + +#ifdef COLLECT_TIME_STATISTICS + t1=timer(); +#endif + //eigVals.dump(0, nEigVecs); + //eigVecs.dump(0, nEigVecs); + //eigVecs.dump(n, nEigVecs); + //eigVecs.dump(2*n, nEigVecs); + // Whiten eigenvector matrix + for(i=0; i()); + cudaCheckError(); + std = Cublas::nrm2(n, eigVecs.raw()+IDX(0,i,n), 1)/std::sqrt(static_cast(n)); + thrust::transform(thrust::device_pointer_cast(eigVecs.raw()+IDX(0,i,n)), + thrust::device_pointer_cast(eigVecs.raw()+IDX(0,i+1,n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs.raw()+IDX(0,i,n)), + thrust::divides()); + cudaCheckError(); + } + delete B; + delete A; + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + Vector work(nEigVecs*n, stream); + Cublas::set_pointer_mode_host(); + Cublas::geam(true, false, nEigVecs, n, + &one, eigVecs.raw(), n, + &zero, (ValueType_*) NULL, nEigVecs, + work.raw(), nEigVecs); + CHECK_CUDA(cudaMemcpyAsync(eigVecs.raw(), work.raw(), + nEigVecs*n*sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + } + + if (scale_eigevec_rows) { + //WARNING: notice that at this point the matrix has already been transposed, so we are scaling columns + scale_obs(nEigVecs,n,eigVecs.raw()); cudaCheckError() + //print_matrix(nEigVecs-ifirst,n,obs,nEigVecs-ifirst,"Scaled obs"); + //print_matrix(nEigVecs-ifirst,n,obs,nEigVecs-ifirst,"Scaled obs"); + } +#ifdef COLLECT_TIME_STATISTICS + t2=timer(); + printf("%f\n",t2-t1); +#endif + +#ifdef COLLECT_TIME_STATISTICS + t1=timer(); +#endif + //eigVecs.dump(0, nEigVecs*n); + // Find partition with k-means clustering + CHECK_NVGRAPH(kmeans(n, nEigVecs, nClusters, + tol_kmeans, maxIter_kmeans, + eigVecs.raw(), clusters, + residual_kmeans, iters_kmeans)); +#ifdef COLLECT_TIME_STATISTICS + t2=timer(); + printf("%f\n\n",t2-t1); +#endif + + + return NVGRAPH_OK; + } + //=================================================== + // Analysis of graph partition + // ========================================================= + + namespace { + /// Functor to generate indicator vectors + /** For use in Thrust transform + */ + template + struct equal_to_i_op { + const IndexType_ i; + public: + equal_to_i_op(IndexType_ _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) + = (thrust::get<0>(t) == i) ? (ValueType_) 1.0 : (ValueType_) 0.0; + } + }; + } + + /// Compute modularity + /** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param parts (Input, device memory, n entries) Cluster assignments. 
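+ * (Each cluster i contributes x_i' * B * x_i to the total, where x_i is the 0/1 indicator vector built from parts and B is the modularity matrix; empty clusters are skipped with a warning.)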
+ * @param modularity On exit, modularity + */ + template + NVGRAPH_ERROR analyzeModularity(ValuedCsrGraph & G, + IndexType_ nClusters, + const IndexType_ * __restrict__ parts, + ValueType_ & modularity) { + + //using namespace thrust; + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Loop index + IndexType_ i; + + // Matrix dimension + IndexType_ n = G.get_num_vertices(); + + // Values for computing partition cost + ValueType_ partModularity, partSize; + + // CUDA stream + // TODO: handle non-zero streams + cudaStream_t stream = 0; + + // Device memory + Vector part_i(n, stream); + Vector Bx(n, stream); + + // Adjacency and Modularity matrices + Matrix * A; + Matrix * B; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Check that parameters are valid + if(nClusters < 1) { + WARNING("invalid parameter (nClusters<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // Initialize Modularity + A = new CsrMatrix(G); + B = new ModularityMatrix(*A, static_cast(G.get_num_edges())); + + // Debug + //Vector ones(n,0); + //ones.fill(1.0); + //B->mv(1, ones.raw(), 0, Bx.raw()); + //Bx.dump(0,n); + //Cublas::dot(n, Bx.raw(), 1, ones.raw(), 1, &partModularity); + //std::cout<< "sum " <(i)); + cudaCheckError(); + + // Compute size of ith partition + Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); + partSize = round(partSize); + if(partSize < 0.5) { + WARNING("empty partition"); + continue; + } + + // Compute modularity + B->mv(1, part_i.raw(), 0, Bx.raw()); + Cublas::dot(n, Bx.raw(), 1, part_i.raw(), 1, &partModularity); + + // Record results + modularity += partModularity; + //std::cout<< "partModularity " <getEdgeSum(); + // Clean up and return + delete B; + delete A; + return NVGRAPH_OK; + + } + + // ========================================================= + // Explicit instantiation + // ========================================================= + template + NVGRAPH_ERROR modularity_maximization( ValuedCsrGraph & G, + int nClusters, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + float tol_lanczos, + int maxIter_kmeans, + float tol_kmeans, + int * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + int & iters_lanczos, + int & iters_kmeans); + template + NVGRAPH_ERROR modularity_maximization( ValuedCsrGraph & G, + int nClusters, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + double tol_lanczos, + int maxIter_kmeans, + double tol_kmeans, + int * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + int & iters_lanczos, + int & iters_kmeans); + template + NVGRAPH_ERROR analyzeModularity(ValuedCsrGraph & G, + int nClusters, + const int * __restrict__ parts, + float & modularity); + template + NVGRAPH_ERROR analyzeModularity(ValuedCsrGraph & G, + int nClusters, + const int * __restrict__ parts, + double & modularity); + +} +//#endif //NVGRAPH_PARTITION + diff --git a/cpp/nvgraph/cpp/src/nvgraph.cu b/cpp/nvgraph/cpp/src/nvgraph.cu new file mode 100644 index 00000000000..810df28114a --- /dev/null +++ b/cpp/nvgraph/cpp/src/nvgraph.cu @@ -0,0 +1,4224 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include // public header **This is NVGRAPH C API** +#include // private header, contains structures, and potentially other things, used in the public C API that should never be exposed. +#include // experimental header, contains hidden API entries, can be shared only under special circumstances without reveling internal things +#include "debug_macros.h" + +#include "2d_partitioning.h" +#include "bfs2d.hxx" + +static inline int check_context(const nvgraphHandle_t h) + { + int ret = 0; + if (h == NULL || !h->nvgraphIsInitialized) + ret = 1; + return ret; +} + +static inline int check_graph(const nvgraphGraphDescr_t d) + { + int ret = 0; + if (d == NULL || d->graphStatus == IS_EMPTY) + ret = 1; + return ret; +} +static inline int check_topology(const nvgraphGraphDescr_t d) + { + int ret = 0; + if (d->graphStatus == IS_EMPTY) + ret = 1; + return ret; +} + +static inline int check_int_size(size_t sz) + { + int ret = 0; + if (sz >= INT_MAX) + ret = 1; + return ret; +} + +static inline int check_int_ptr(const int* p) + { + int ret = 0; + if (!p) + ret = 1; + return ret; +} + +static inline int check_uniform_type_array(const cudaDataType_t * t, size_t sz) + { + int ret = 0; + cudaDataType_t uniform_type = t[0]; + for (size_t i = 1; i < sz; i++) + { + if (t[i] != uniform_type) + ret = 1; + } + return ret; +} + +template +bool check_ptr(const T* p) + { + bool ret = false; + if (!p) + ret = true; + return ret; +} + +namespace nvgraph +{ + +//TODO: make those template functions in a separate header to be included by both +//graph_extractor.cu and nvgraph.cpp; +//right now this header does not exist and including graph_concrete_visitors.hxx +//doesn't compile because of the Thrust code; +// + extern CsrGraph* extract_subgraph_by_vertices(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + + extern CsrGraph* extract_subgraph_by_edges(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + +#ifndef NVGRAPH_LIGHT + + extern CsrGraph* contract_graph_csr_mul(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_sum(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const 
int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_min(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_max(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_mul(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_sum(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_min(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_max(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_mul(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_sum(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_min(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_max(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); +#endif + + nvgraphStatus_t getCAPIStatusForError(NVGRAPH_ERROR err) + { + nvgraphStatus_t ret = NVGRAPH_STATUS_SUCCESS; + + switch (err) + { + case NVGRAPH_OK: + ret = NVGRAPH_STATUS_SUCCESS; + break; + case NVGRAPH_ERR_BAD_PARAMETERS: + ret = NVGRAPH_STATUS_INVALID_VALUE; + break; + case NVGRAPH_ERR_UNKNOWN: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; + break; + case NVGRAPH_ERR_CUDA_FAILURE: + ret = NVGRAPH_STATUS_EXECUTION_FAILED; + break; + case NVGRAPH_ERR_THRUST_FAILURE: + ret = NVGRAPH_STATUS_EXECUTION_FAILED; + break; + case NVGRAPH_ERR_IO: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; + break; + case NVGRAPH_ERR_NOT_IMPLEMENTED: + ret = NVGRAPH_STATUS_INVALID_VALUE; + break; + case NVGRAPH_ERR_NO_MEMORY: + ret = NVGRAPH_STATUS_ALLOC_FAILED; + break; + case NVGRAPH_ERR_NOT_CONVERGED: + ret = NVGRAPH_STATUS_NOT_CONVERGED; + break; + default: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; + } + return ret; + } + + extern "C" { + const char* nvgraphStatusGetString(nvgraphStatus_t status) + { + switch (status) { + case NVGRAPH_STATUS_SUCCESS: + return "Success"; + case NVGRAPH_STATUS_NOT_INITIALIZED: + return "nvGRAPH not initialized"; + case NVGRAPH_STATUS_ALLOC_FAILED: + return "nvGRAPH alloc failed"; + case NVGRAPH_STATUS_INVALID_VALUE: + return "nvGRAPH invalid value"; + case NVGRAPH_STATUS_ARCH_MISMATCH: 
+ return "nvGRAPH arch mismatch"; + case NVGRAPH_STATUS_MAPPING_ERROR: + return "nvGRAPH mapping error"; + case NVGRAPH_STATUS_EXECUTION_FAILED: + return "nvGRAPH execution failed"; + case NVGRAPH_STATUS_INTERNAL_ERROR: + return "nvGRAPH internal error"; + case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: + return "nvGRAPH type not supported"; + case NVGRAPH_STATUS_NOT_CONVERGED: + return "nvGRAPH algorithm failed to converge"; + case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: + return "nvGRAPH graph type not supported"; + default: + return "Unknown nvGRAPH Status"; + } + } + ; + } + + static nvgraphStatus_t nvgraphCreateMulti_impl(struct nvgraphContext **outCtx, + int numDevices, + int* _devices) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + int device; + + CHECK_CUDA(cudaFree((void * )0)); + CHECK_CUDA(cudaGetDevice(&device)); + struct nvgraphContext *ctx = NULL; + ctx = (struct nvgraphContext *) malloc(sizeof(*ctx)); + if (!ctx) { + FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); + } + + //cnmem + memset(&ctx->cnmem_device, 0, sizeof(ctx->cnmem_device)); // init all to 0 + ctx->cnmem_device.device = device; // cnmem runs on the device set by cudaSetDevice + + size_t init_alloc = 1; // Initial allocation tentative, it is currently 1 so this feature is basically disabeled. + + // Warning : Should uncomment that if using init_alloc > 1 + //size_t freeMem, totalMem; + //cudaMemGetInfo(&freeMem, &totalMem); + //if (freeMem < init_alloc) // Couldn't find enough memory to do the initial alloc + // init_alloc = 1; // (0 is used as default parameter in cnmem) + + ctx->cnmem_device.size = init_alloc; + cnmemDevice_t* devices = (cnmemDevice_t*) malloc(sizeof(cnmemDevice_t) * numDevices); + memset(devices, 0, sizeof(cnmemDevice_t) * numDevices); + for (int i = 0; i < numDevices; i++) { + devices[i].device = _devices[i]; + devices[i].size = 1; + } + cnmemStatus_t cm_status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT); + free(devices); + if (cm_status != CNMEM_STATUS_SUCCESS) + FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_UNKNOWN); + + //Cublas and Cusparse + nvgraph::Cusparse::get_handle(); + nvgraph::Cublas::get_handle(); + + //others + ctx->stream = 0; + ctx->nvgraphIsInitialized = true; + + if (outCtx) { + *outCtx = ctx; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphCreate_impl(struct nvgraphContext **outCtx) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + int device; + + CHECK_CUDA(cudaFree((void * )0)); + CHECK_CUDA(cudaGetDevice(&device)); + struct nvgraphContext *ctx = NULL; + ctx = (struct nvgraphContext *) malloc(sizeof(*ctx)); + if (!ctx) { + FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); + } + + //cnmem + memset(&ctx->cnmem_device, 0, sizeof(ctx->cnmem_device)); // init all to 0 + ctx->cnmem_device.device = device; // cnmem runs on the device set by cudaSetDevice + + size_t init_alloc = 1; // Initial allocation tentative, it is currently 1 so this feature is basically disabeled. 
+ + // Warning : Should uncomment that if using init_alloc > 1 + //size_t freeMem, totalMem; + //cudaMemGetInfo(&freeMem, &totalMem); + //if (freeMem < init_alloc) // Couldn't find enough memory to do the initial alloc + // init_alloc = 1; // (0 is used as default parameter in cnmem) + + ctx->cnmem_device.size = init_alloc; + + cnmemStatus_t cm_status = cnmemInit(1, &ctx->cnmem_device, CNMEM_FLAGS_DEFAULT); + if (cm_status != CNMEM_STATUS_SUCCESS) + FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_UNKNOWN); + + //Cublas and Cusparse + nvgraph::Cusparse::get_handle(); + nvgraph::Cublas::get_handle(); + + //others + ctx->stream = 0; + ctx->nvgraphIsInitialized = true; + + if (outCtx) { + *outCtx = ctx; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphDestroy_impl(nvgraphHandle_t handle) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_NO_MEMORY); + + //Cublas and Cusparse + nvgraph::Cusparse::destroy_handle(); + nvgraph::Cublas::destroy_handle(); + //cnmem + +// compiler is complaining, cm_status is not used in release build +#ifdef DEBUG + cnmemStatus_t cm_status = cnmemFinalize(); + if( cm_status != CNMEM_STATUS_SUCCESS ) { + CERR() << "Warning: " << cnmemGetErrorString(cm_status) << std::endl; + } +#else + cnmemFinalize(); +#endif + //others + free(handle); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphCreateGraphDescr_impl(nvgraphHandle_t handle, + struct nvgraphGraphDescr **outGraphDescr) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + struct nvgraphGraphDescr *descrG = NULL; + descrG = (struct nvgraphGraphDescr*) malloc(sizeof(*descrG)); + if (!descrG) + { + FatalError("Cannot allocate graph descriptor.", NVGRAPH_ERR_UNKNOWN); + } + descrG->graphStatus = IS_EMPTY; + if (outGraphDescr) + { + *outGraphDescr = descrG; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphDestroyGraphDescr_impl(nvgraphHandle_t handle, + struct nvgraphGraphDescr *descrG) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG) + { + if (descrG->TT == NVGRAPH_2D_32I_32I) { + switch (descrG->T) { + case CUDA_R_32I: { + nvgraph::Matrix2d* m = + static_cast*>(descrG->graph_handle); + delete m; + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + else { + switch (descrG->graphStatus) { + case IS_EMPTY: { + break; + } + case HAS_TOPOLOGY: { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + delete CSRG; + break; + } + case HAS_VALUES: { + if (descrG->T == CUDA_R_32F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else if (descrG->T == CUDA_R_64F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else if (descrG->T == CUDA_R_32I) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + free(descrG); + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t 
NVGRAPH_API nvgraphSetStream_impl(nvgraphHandle_t handle, cudaStream_t stream) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + //CnMem + cnmemStatus_t cm_status = cnmemRegisterStream(stream); + if (cm_status != CNMEM_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + // nvgraph handle + handle->stream = stream; + //Cublas and Cusparse + nvgraph::Cublas::setStream(stream); + nvgraph::Cusparse::setStream(stream); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus != IS_EMPTY) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_ptr(topologyData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) + { + int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; + switch (TT) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) + || check_ptr(t->destination_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) + || check_ptr(t->source_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + descrG->TT = TT; + + // Create the internal CSR representation + nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); + + CHECK_CUDA(cudaMemcpy(CSRG->get_raw_row_offsets(), + neighborhood, + (size_t )((CSRG->get_num_vertices() + 1) * sizeof(int)), + cudaMemcpyDefault)); + + CHECK_CUDA(cudaMemcpy(CSRG->get_raw_column_indices(), + edgedest, + (size_t )((CSRG->get_num_edges()) * sizeof(int)), + cudaMemcpyDefault)); + + // Set the graph handle + descrG->graph_handle = CSRG; + descrG->graphStatus = HAS_TOPOLOGY; + } + else if (TT == NVGRAPH_2D_32I_32I) { + nvgraph2dCOOTopology32I_t td = static_cast(topologyData); + switch (td->valueType) { + case CUDA_R_32I: { + if (!td->nvertices || !td->nedges || !td->source_indices + || !td->destination_indices || !td->numDevices || !td->devices + || !td->blockN) + return NVGRAPH_STATUS_INVALID_VALUE; + descrG->TT = TT; + descrG->graphStatus = HAS_TOPOLOGY; + if (td->values) + descrG->graphStatus = HAS_VALUES; + descrG->T = td->valueType; + std::vector devices; + for (int32_t i = 0; i < td->numDevices; i++) + devices.push_back(td->devices[i]); + nvgraph::MatrixDecompositionDescription description( td->nvertices, + td->blockN, + td->nedges, + devices); + nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); + *m = nvgraph::COOto2d(description, + td->source_indices, + td->destination_indices, + (int32_t*) td->values); + descrG->graph_handle = m; + break; + } + default: { + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + } + else + { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + } + 
NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus != IS_EMPTY) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_ptr(topologyData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) + { + int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; + switch (TT) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) + || check_ptr(t->destination_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) + || check_ptr(t->source_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + descrG->TT = TT; + + // Create the internal CSR representation + nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); + + CSRG->set_raw_row_offsets(neighborhood); + CSRG->set_raw_column_indices(edgedest); + + // Set the graph handle + descrG->graph_handle = CSRG; + descrG->graphStatus = HAS_TOPOLOGY; + } + else if (TT == NVGRAPH_2D_32I_32I) { + nvgraph2dCOOTopology32I_t td = static_cast(topologyData); + switch (td->valueType) { + case CUDA_R_32I: { + if (!td->nvertices || !td->nedges || !td->source_indices + || !td->destination_indices || !td->numDevices || !td->devices + || !td->blockN) + return NVGRAPH_STATUS_INVALID_VALUE; + descrG->TT = TT; + descrG->graphStatus = HAS_TOPOLOGY; + if (td->values) + descrG->graphStatus = HAS_VALUES; + descrG->T = td->valueType; + std::vector devices; + for (int32_t i = 0; i < td->numDevices; i++) + devices.push_back(td->devices[i]); + nvgraph::MatrixDecompositionDescription description( td->nvertices, + td->blockN, + td->nedges, + devices); + nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); + *m = nvgraph::COOto2d(description, + td->source_indices, + td->destination_indices, + (int32_t*) td->values); + descrG->graph_handle = m; + break; + } + default: { + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + } + else + { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* TT) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_topology(descrG)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + nvgraphTopologyType_t graphTType = descrG->TT; + + if (TT != NULL) + *TT = graphTType; + + if (topologyData != NULL) { + nvgraph::CsrGraph *CSRG = + static_cast *>(descrG->graph_handle); + int v = static_cast(CSRG->get_num_vertices()); + int e = static_cast(CSRG->get_num_edges()); + int *neighborhood = 
NULL, *edgedest = NULL; + + switch (graphTType) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + t->nvertices = static_cast(v); + t->nedges = static_cast(e); + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + t->nvertices = static_cast(v); + t->nedges = static_cast(e); + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + + if (neighborhood != NULL) { + CHECK_CUDA(cudaMemcpy(neighborhood, + CSRG->get_raw_row_offsets(), + (size_t )((v + 1) * sizeof(int)), + cudaMemcpyDefault)); + } + + if (edgedest != NULL) { + CHECK_CUDA(cudaMemcpy(edgedest, + CSRG->get_raw_column_indices(), + (size_t )((e) * sizeof(int)), + cudaMemcpyDefault)); + } + + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) + || check_ptr(settypes)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_uniform_type_array(settypes, numsets)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (*settypes == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = *settypes; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (*settypes != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (*settypes == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || 
check_int_size(setnum)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (settype == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = settype; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (settype != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // transfer + if (settype == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (float*)vertexData, NULL); + } + else if (settype == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (double*)vertexData, NULL); + } + else if (settype == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (int*)vertexData, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) + || check_ptr(settypes)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_uniform_type_array(settypes, numsets)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // Look at what kind of graph we have + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (*settypes == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = *settypes; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (*settypes != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else 
+ return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (*settypes == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // Look at what kind of graph we have + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (settype == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = settype; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (settype != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (settype == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (float*)edgeData, NULL); + } + else if (settype == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (double*)edgeData, NULL); + } + else if (settype == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (int*)edgeData, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(vertexData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + FatalError("Graph should have allocated values.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum 
>= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (float*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (double*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (int*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError() + ; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(vertexData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + FatalError("Graph should have values.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((float*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((double*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((int*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError() + ; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology_impl(nvgraphHandle_t handle, + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData) + { + + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_ptr(dstEdgeData) || check_ptr(srcEdgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + size_t sizeT; + if (*dataType == CUDA_R_32F) + sizeT = sizeof(float); + else if (*dataType == CUDA_R_64F) + sizeT = sizeof(double); + else + return 
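
The Allocate/Attach/Set vertex- and edge-data implementations above promote the plain `CsrGraph` to a `MultiValuedCsrGraph` on first use and copy user buffers with `cudaMemcpyDefault`, so host or device pointers both work; all value sets end up sharing one `cudaDataType_t` (`check_uniform_type_array` plus the HAS_VALUES type check). A hedged continuation of the earlier sketch on the same hypothetical 4-vertex / 5-edge CSR graph.

```cpp
#include <nvgraph.h>

// Hypothetical continuation: two float vertex value sets (0: algorithm
// output, 1: scratch/bookmark) and one float edge value set (the weights).
static void example_attach_values(nvgraphHandle_t handle, nvgraphGraphDescr_t graph)
{
    cudaDataType_t vertex_types[2] = {CUDA_R_32F, CUDA_R_32F};
    cudaDataType_t edge_type       = CUDA_R_32F;
    float weights[5] = {0.5f, 1.0f, 0.75f, 2.0f, 1.5f};

    nvgraphAllocateVertexData(handle, graph, 2, vertex_types);
    nvgraphAllocateEdgeData(handle, graph, 1, &edge_type);
    nvgraphSetEdgeData(handle, graph, weights, 0);   // edge set 0 = weights
}
```
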
NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + // Trust me, this better than nested if's. + if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSR_32) { // CSR2CSR + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + CHECK_CUDA(cudaMemcpy(dstT->source_offsets, + srcT->source_offsets, + (srcT->nvertices + 1) * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSC_32) { // CSR2CSC + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + csr2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_offsets, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, dataType); + } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_COO_32) { // CSR2COO + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + csr2coo(srcT->source_offsets, + srcT->nedges, + srcT->nvertices, + dstT->source_indices, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + // Step 1: Convert to COO_Source + csr2coo(srcT->source_offsets, + srcT->nedges, + srcT->nvertices, + dstT->source_indices, + CUSPARSE_INDEX_BASE_ZERO); + // Step 2: Convert to COO_Destination + cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + dstT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSR_32) { // CSC2CSR + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + csc2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_offsets, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, dataType); + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSC_32) { // CSC2CSC + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + CHECK_CUDA(cudaMemcpy(dstT->destination_offsets, + srcT->destination_offsets, + (srcT->nvertices + 1) * sizeof(int), + 
cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_COO_32) { // CSC2COO + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { + // Step 1: Convert to COO_Destination + csr2coo(srcT->destination_offsets, + srcT->nedges, + srcT->nvertices, + dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO); + // Step 2: Convert to COO_Source + cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, dstT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + csr2coo(srcT->destination_offsets, + srcT->nedges, + srcT->nvertices, + dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSR_32) { // COO2CSR + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { + coo2csr(srcT->source_indices, + srcT->nedges, + srcT->nvertices, + dstT->source_offsets, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + cood2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { + coou2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSC_32) { // COO2CSC + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { + coos2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (srcT->tag == 
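
`nvgraphConvertTopology_impl` (started above, continuing below) fills caller-provided destination arrays and, for the transposing cases, goes through cuSPARSE (`csr2csc` and friends), so device buffers are the safe choice. A hypothetical CSR-to-CSC sketch; the `d_*` pointers are assumed to be device-resident copies of the CSR arrays from the earlier example.

```cpp
#include <cuda_runtime.h>
#include <nvgraph.h>

// Hypothetical sketch: CSR -> CSC conversion of a 4-vertex / 5-edge graph.
static void example_convert_topology(nvgraphHandle_t handle,
                                     int*   d_src_offsets,   // int[5],   device
                                     int*   d_src_indices,   // int[5],   device
                                     float* d_src_weights)   // float[5], device
{
    cudaDataType_t t = CUDA_R_32F;

    nvgraphCSRTopology32I_st src;
    src.nvertices           = 4;
    src.nedges              = 5;
    src.source_offsets      = d_src_offsets;
    src.destination_indices = d_src_indices;

    // Destination buffers are owned by the caller; the call only fills them.
    int *d_dst_offsets, *d_dst_indices;
    float* d_dst_weights;
    cudaMalloc(reinterpret_cast<void**>(&d_dst_offsets), 5 * sizeof(int));   // nvertices + 1
    cudaMalloc(reinterpret_cast<void**>(&d_dst_indices), 5 * sizeof(int));   // nedges
    cudaMalloc(reinterpret_cast<void**>(&d_dst_weights), 5 * sizeof(float));

    nvgraphCSCTopology32I_st dst;
    dst.destination_offsets = d_dst_offsets;
    dst.source_indices      = d_dst_indices;

    nvgraphConvertTopology(handle, NVGRAPH_CSR_32, &src, d_src_weights,
                           &t, NVGRAPH_CSC_32, &dst, d_dst_weights);
}
```
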
NVGRAPH_SORTED_BY_DESTINATION) { + coo2csr(srcT->destination_indices, + srcT->nedges, + srcT->nvertices, + dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { + coou2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_COO_32) { // COO2COO + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == dstT->tag || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { + cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else { + return NVGRAPH_STATUS_INVALID_VALUE; + } + + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + nvgraphStatus_t status = NVGRAPH_STATUS_SUCCESS; + try + { + if (check_context(handle) || check_graph(srcDescrG)) // Graph must have a topology + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (dstDescrG->graphStatus != IS_EMPTY) // dst Graph must be empty + return NVGRAPH_STATUS_INVALID_VALUE; + + // graphs can only have CSR or CSC topology (EL is for storage only) + if (srcDescrG->TT != NVGRAPH_CSR_32 && srcDescrG->TT != NVGRAPH_CSC_32) + return NVGRAPH_STATUS_INTERNAL_ERROR; // invalid state, you can only create graph with CSR/CSC + if (dstTType != NVGRAPH_CSR_32 && dstTType != NVGRAPH_CSC_32) + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; // only conversion to CSR/CSC is allowed + + int nvertices, nedges; + int *srcOffsets = NULL, *srcIndices = NULL, *dstOffsets = NULL, *dstIndices = NULL; + SHARED_PREFIX::shared_ptr permutation, offsets, indices; + + // Step 1: get source graph structure + nvgraph::CsrGraph *CSRG = + 
static_cast *>(srcDescrG->graph_handle); + nvertices = static_cast(CSRG->get_num_vertices()); + nedges = static_cast(CSRG->get_num_edges()); + srcOffsets = CSRG->get_raw_row_offsets(); + srcIndices = CSRG->get_raw_column_indices(); + + // Step 2: convert topology and get permutation array. + if (srcDescrG->TT != dstTType) { // Otherwise conversion is not needed, only copy. + offsets = allocateDevice(nvertices + 1, NULL); + indices = allocateDevice(nedges, NULL); + permutation = allocateDevice(nedges, NULL); + csr2cscP(nvertices, nvertices, nedges, + srcOffsets, + srcIndices, + indices.get(), + offsets.get(), permutation.get(), CUSPARSE_INDEX_BASE_ZERO); + dstOffsets = offsets.get(); + dstIndices = indices.get(); + } else { + dstOffsets = srcOffsets; + dstIndices = srcIndices; + } + + // Step 3: Set dst graph structure + if (dstTType == NVGRAPH_CSR_32) { + nvgraphCSRTopology32I_st dstTopology; + dstTopology.nedges = nedges; + dstTopology.nvertices = nvertices; + dstTopology.source_offsets = dstOffsets; + dstTopology.destination_indices = dstIndices; + status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); + } else if (dstTType == NVGRAPH_CSC_32) { + nvgraphCSCTopology32I_st dstTopology; + dstTopology.nedges = nedges; + dstTopology.nvertices = nvertices; + dstTopology.destination_offsets = dstOffsets; + dstTopology.source_indices = dstIndices; + status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); + } else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + offsets.reset(); + indices.reset(); + + // Step 4: Allocate, convert and set edge+vertex data on the new graph + if (srcDescrG->graphStatus == HAS_VALUES) { + if (srcDescrG->T == CUDA_R_32F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(srcDescrG->graph_handle); + size_t vertexDim = MCSRG->get_num_vertex_dim(); + size_t edgesDim = MCSRG->get_num_edge_dim(); + // Step 4.1: allocate and set vertex data (no need for convert) + if (vertexDim > 0) { + std::vector vertexDataType(vertexDim); + std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); + status = nvgraphAllocateVertexData(handle, + dstDescrG, + vertexDim, + vertexDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + for (size_t i = 0; i < vertexDim; ++i) { + void *vertexData = MCSRG->get_raw_vertex_dim(i); + status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + // Step 4.2: allocate and set vertex data + if (edgesDim > 0) { + void *dstEdgeData = NULL; + SHARED_PREFIX::shared_ptr dstEdgeDataSP; + + std::vector edgeDataType(edgesDim); + std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); + status = nvgraphAllocateEdgeData(handle, + dstDescrG, + edgesDim, + edgeDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + // allocate edge data memory (if there is a need) + if (edgesDim > 0 && srcDescrG->TT != dstTType) { + dstEdgeDataSP = allocateDevice(nedges, NULL); + dstEdgeData = dstEdgeDataSP.get(); + } + // Convert and set edge data (using permutation array) + for (size_t i = 0; i < edgesDim; ++i) { + void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); + if (srcDescrG->TT != dstTType) // Convert using permutation array + gthrX(nedges, + srcEdgeData, + dstEdgeData, + permutation.get(), + CUSPARSE_INDEX_BASE_ZERO, + 
&(srcDescrG->T)); + else + dstEdgeData = srcEdgeData; + // set edgedata + status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + } else if (srcDescrG->T == CUDA_R_64F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(srcDescrG->graph_handle); + size_t vertexDim = MCSRG->get_num_vertex_dim(); + size_t edgesDim = MCSRG->get_num_edge_dim(); + // Step 4.1: allocate and set vertex data (no need for convert) + if (vertexDim > 0) { + std::vector vertexDataType(vertexDim); + std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); + status = nvgraphAllocateVertexData(handle, + dstDescrG, + vertexDim, + vertexDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + for (size_t i = 0; i < vertexDim; ++i) { + void *vertexData = MCSRG->get_raw_vertex_dim(i); + status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + // Step 4.2: allocate and set vertex data + if (edgesDim > 0) { + void *dstEdgeData = NULL; + SHARED_PREFIX::shared_ptr dstEdgeDataSP; + + std::vector edgeDataType(edgesDim); + std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); + status = nvgraphAllocateEdgeData(handle, + dstDescrG, + edgesDim, + edgeDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + // allocate edge data memory (if there is a need) + if (edgesDim > 0 && srcDescrG->TT != dstTType) { + dstEdgeDataSP = allocateDevice(nedges, NULL); + dstEdgeData = dstEdgeDataSP.get(); + } + // Convert and set edge data (using permutation array) + for (size_t i = 0; i < edgesDim; ++i) { + void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); + if (srcDescrG->TT != dstTType) // Convert using permutation array + gthrX(nedges, + srcEdgeData, + dstEdgeData, + permutation.get(), + CUSPARSE_INDEX_BASE_ZERO, + &(srcDescrG->T)); + else + dstEdgeData = srcEdgeData; + // set edgedata + status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + } else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(edgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (float*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (double*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), + 
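
In contrast to the raw-array conversion, `nvgraphConvertGraph_impl` above converts a whole descriptor: it rebuilds the structure with `csr2cscP` and carries every vertex and edge value set across, gathering edge data through the permutation array. A minimal hypothetical sketch, assuming a public `nvgraphConvertGraph` wrapper that forwards to this implementation.

```cpp
#include <nvgraph.h>

// Hypothetical sketch: clone the CSR graph from the earlier examples into a
// CSC descriptor so the CSC-only solvers below (SSSP, widest path, PageRank)
// can use it. The destination descriptor must still be empty.
static nvgraphGraphDescr_t example_to_csc(nvgraphHandle_t handle,
                                          nvgraphGraphDescr_t graph_csr)
{
    nvgraphGraphDescr_t graph_csc;
    nvgraphCreateGraphDescr(handle, &graph_csc);
    nvgraphConvertGraph(handle, graph_csr, graph_csc, NVGRAPH_CSC_32);
    return graph_csc;
}
```
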
cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (int*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError() + ; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(edgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((float*) edgeData, + MCSRG->get_raw_edge_dim(setnum), + (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((double*) edgeData, + MCSRG->get_raw_edge_dim(setnum), + (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError() + ; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv_impl_cub(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t SR) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + + try + { + // some basic checks + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + rc = SemiringAPILauncher(handle, descrG, weight_index, alpha, x, beta, y, SR); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSssp_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_int_ptr(source_vert)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; +// cudaError_t cuda_status; + + if (descrG->graphStatus != HAS_VALUES) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::Sssp 
sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, 0.0, FLT_MAX, co.raw()); + MCSRG->get_vertex_dim(sssp).copy(co); + rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, 0.0, DBL_MAX, co.raw()); + MCSRG->get_vertex_dim(sssp).copy(co); + rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphTraversal_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vertex_ptr, + const nvgraphTraversalParameter_t params) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_ptr(source_vertex_ptr)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph (storing results) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T != CUDA_R_32I) //results are ints + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + //Results (bfs distances, predecessors..) 
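
`nvgraphSssp_impl` above accepts only CSC_32 graphs with values; the distances land in the vertex value set named by the last argument. A hypothetical sketch using the CSC descriptor from the conversion example, assuming the nvgraph.h wrappers forward to the `_impl` functions shown here.

```cpp
#include <nvgraph.h>

// Hypothetical sketch: single-source shortest paths from vertex 0.
// Edge set 0 holds the weights, vertex set 0 receives the distances.
static void example_sssp(nvgraphHandle_t handle, nvgraphGraphDescr_t graph_csc)
{
    int source = 0;
    nvgraphSssp(handle, graph_csc, /*weight_index=*/0, &source, /*sssp_index=*/0);

    float distances[4];   // 4 vertices in the running example
    nvgraphGetVertexData(handle, graph_csc, distances, 0);
}
```
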
are written in dimension in mvcsrg + nvgraph::MultiValuedCsrGraph *MCSRG = static_cast*>(descrG->graph_handle); + + // + //Computing traversal parameters + // + + size_t distancesIndex, predecessorsIndex, edgeMaskIndex; + size_t undirectedFlagParam; + size_t alpha_ul, beta_ul; + + int *distances = NULL, *predecessors = NULL, *edge_mask = NULL; + + nvgraphTraversalGetDistancesIndex(params, &distancesIndex); + nvgraphTraversalGetPredecessorsIndex(params, &predecessorsIndex); + nvgraphTraversalGetEdgeMaskIndex(params, &edgeMaskIndex); + nvgraphTraversalGetUndirectedFlag(params, &undirectedFlagParam); + nvgraphTraversalGetAlpha(params, &alpha_ul); + nvgraphTraversalGetBeta(params, &beta_ul); + + int alpha = static_cast(alpha_ul); + int beta = static_cast(beta_ul); + + //If distances_index was set by user, then use it + if (distancesIndex <= MCSRG->get_num_vertex_dim()) { + distances = MCSRG->get_vertex_dim(distancesIndex).raw(); + } + + //If predecessors_index was set by user, then use it + if (predecessorsIndex <= MCSRG->get_num_vertex_dim()) { + predecessors = MCSRG->get_vertex_dim(predecessorsIndex).raw(); + } + + //If edgemask_index was set by user, then use it + if (edgeMaskIndex <= MCSRG->get_num_vertex_dim()) { + edge_mask = MCSRG->get_edge_dim(edgeMaskIndex).raw(); + } + + int source_vertex = *source_vertex_ptr; + + int n = static_cast(MCSRG->get_num_vertices()); + int nnz = static_cast(MCSRG->get_num_edges()); + int *row_offsets = MCSRG->get_raw_row_offsets(); + int *col_indices = MCSRG->get_raw_column_indices(); + + bool undirected = (bool) undirectedFlagParam; + + if (source_vertex < 0 || source_vertex >= n) { + return NVGRAPH_STATUS_INVALID_VALUE; + } + + //Calling corresponding implementation + switch (traversalT) { + case NVGRAPH_TRAVERSAL_BFS: + nvgraph::Bfs bfs_solver(n, + nnz, + row_offsets, + col_indices, + !undirected, + alpha, + beta, + handle->stream); + + //To easily implement multi source with single source, + //loop on those two + rc = bfs_solver.configure(distances, predecessors, edge_mask); + rc = bfs_solver.traverse(source_vertex); + break; + }; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + /** + * CAPI Method for calling 2d BFS algorithm. + * @param handle Nvgraph context handle. + * @param descrG Graph handle (must be 2D partitioned) + * @param source_vert The source vertex ID + * @param distances Pointer to memory allocated to store the distances. + * @param predecessors Pointer to memory allocated to store the predecessors + * @return Status code. 
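
`nvgraphTraversal_impl` above requires a CSR_32 graph whose value type is CUDA_R_32I and reads the result-set indices out of the parameter object with the Get* helpers. A hypothetical sketch: `graph_csr_int` is assumed to be a separate CSR descriptor built like the first example, and `nvgraphTraversalParameterInit` / the `nvgraphTraversalSet*` helpers are assumed to be the public counterparts of the Get* calls used in the implementation.

```cpp
#include <nvgraph.h>

// Hypothetical sketch: BFS distances and predecessors into two int vertex sets.
static void example_bfs(nvgraphHandle_t handle, nvgraphGraphDescr_t graph_csr_int)
{
    cudaDataType_t itypes[2] = {CUDA_R_32I, CUDA_R_32I};
    nvgraphAllocateVertexData(handle, graph_csr_int, 2, itypes);

    nvgraphTraversalParameter_t params;
    nvgraphTraversalParameterInit(&params);
    nvgraphTraversalSetDistancesIndex(&params, 0);    // vertex set 0: distances
    nvgraphTraversalSetPredecessorsIndex(&params, 1); // vertex set 1: predecessors
    nvgraphTraversalSetUndirectedFlag(&params, 0);    // directed graph

    int source = 0;
    nvgraphTraversal(handle, graph_csr_int, NVGRAPH_TRAVERSAL_BFS, &source, params);
}
```
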
+ */ + nvgraphStatus_t NVGRAPH_API nvgraph2dBfs_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try { + if (check_context(handle) || check_graph(descrG)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus == IS_EMPTY) + return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->TT != NVGRAPH_2D_32I_32I) + return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->T != CUDA_R_32I) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::Matrix2d* m = static_cast*>(descrG->graph_handle); +// std::cout << m->toString(); + nvgraph::Bfs2d bfs(m, true, 0, 0); + rc = bfs.configure(distances, predecessors); + rc = bfs.traverse(source_vert); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphWidestPath_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_int_ptr(source_vert)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + +// cudaError_t cuda_status; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, FLT_MAX, -FLT_MAX, co.raw()); + MCSRG->get_vertex_dim(widest_path).copy(co); + rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, DBL_MAX, -DBL_MAX, co.raw()); + MCSRG->get_vertex_dim(widest_path).copy(co); + rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphPagerank_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const int has_guess, + const size_t rank, + const float tolerance, + const int max_iter) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_ptr(alpha)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a 
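
`nvgraphWidestPath_impl` above mirrors the SSSP calling pattern (CSC_32 graph, weight set index, source pointer, output vertex-set index); only the connectivity initialisation differs. A one-call hypothetical sketch on the same CSC descriptor.

```cpp
#include <nvgraph.h>

// Hypothetical sketch: widest (maximum-bottleneck) path values from vertex 0.
static void example_widest_path(nvgraphHandle_t handle, nvgraphGraphDescr_t graph_csc)
{
    int source = 0;
    nvgraphWidestPath(handle, graph_csc, /*weight_index=*/0, &source,
                      /*widest_path_index=*/0);
}
```
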
MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (!(has_guess == 0 || has_guess == 1)) + return NVGRAPH_STATUS_INVALID_VALUE; + + int max_it; + float tol; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float alphaT = *static_cast(alpha); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream); + nvgraph::Vector bm(n, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + bm.copy(MCSRG->get_vertex_dim(bookmark)); + nvgraph::Pagerank pagerank_solver( *MCSRG->get_valued_csr_graph(weight_index), + bm); + rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); + break; + } + case CUDA_R_64F: + { + double alphaT = *static_cast(alpha); + if (alphaT <= 0.0 || alphaT >= 1.0) + return NVGRAPH_STATUS_INVALID_VALUE; + + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream); + nvgraph::Vector bm(n, handle->stream); + bm.copy(MCSRG->get_vertex_dim(bookmark)); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::Pagerank pagerank_solver( *MCSRG->get_valued_csr_graph(weight_index), + bm); + rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const float tolerance, + const int max_iter, + const int subspace_size, + const int has_guess, + const size_t rank) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_ptr(alpha)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + +// cudaError_t cuda_status; + int max_it; + int ss_sz; + float tol; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (subspace_size > 0) + ss_sz = subspace_size; + else + ss_sz = 8; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) 
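
`nvgraphPagerank_impl` above wants a CSC_32 graph, a damping factor alpha strictly inside (0, 1), a bookmark vertex set (conventionally 1.0 for dangling vertices), and a vertex set for the ranks; passing 0.0f for the tolerance and 0 for max_iter falls back to the defaults coded above (1e-6 and 500 iterations). A hypothetical sketch reusing the CSC descriptor and the two float vertex sets allocated earlier.

```cpp
#include <nvgraph.h>

// Hypothetical sketch: PageRank with damping 0.85, no initial guess.
static void example_pagerank(nvgraphHandle_t handle, nvgraphGraphDescr_t graph_csc)
{
    float bookmark[4] = {0.0f, 0.0f, 0.0f, 0.0f};  // no dangling vertices here
    nvgraphSetVertexData(handle, graph_csc, bookmark, 1);   // vertex set 1

    float alpha = 0.85f;
    nvgraphPagerank(handle, graph_csc, /*weight_index=*/0, &alpha,
                    /*bookmark_index=*/1, /*has_guess=*/0, /*rank_index=*/0,
                    /*tolerance=*/0.0f, /*max_iter=*/0);
}
```
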
+ { + case CUDA_R_32F: + { + float alphaT = *static_cast(alpha); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::ImplicitArnoldi iram_solver( *MCSRG->get_valued_csr_graph(weight_index), + MCSRG->get_vertex_dim(bookmark), + tol, + max_it, + alphaT); + rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); + break; + } + case CUDA_R_64F: + { + // curently iram solver accept float for alpha + double alphaTemp = *static_cast(alpha); + float alphaT = static_cast(alphaTemp); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::ImplicitArnoldi iram_solver( *MCSRG->get_valued_csr_graph(weight_index), + MCSRG->get_vertex_dim(bookmark), + tol, + max_it, + alphaT); + rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + typedef int IndexType; + + try + { + if (check_context(handle) || + check_graph(descrG) || + !subdescrG || + check_int_size(numvertices) || + check_ptr(subvertices)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (!numvertices) + return NVGRAPH_STATUS_INVALID_VALUE; + + subdescrG->TT = descrG->TT; + subdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + + Graph* subgraph = extract_subgraph_by_vertices(*CSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_vertices(*MCSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_vertices(*MCSRG, + subvertices, + numvertices, + handle->stream); + + 
subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + //TODO: extract handle->stream info, from handler/nvgraphContext (?) + typedef int IndexType; + + try + { + if (check_context(handle) || + check_graph(descrG) || + !subdescrG || + check_int_size(numedges) || + check_ptr(subedges)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (!numedges) + return NVGRAPH_STATUS_INVALID_VALUE; + + subdescrG->TT = descrG->TT; + subdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + Graph* subgraph = extract_subgraph_by_edges(*CSRG, + subedges, + numedges, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const int evs_type, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + int evs_max_it, kmean_max_it; + int iters_lanczos, iters_kmeans; + float evs_tol, kmean_tol; + + if (evs_max_iter > 0) + evs_max_it = evs_max_iter; + else + evs_max_it = 4000; + + if (evs_tolerance == 0.0f) + evs_tol = 1.0E-3f; + else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) + evs_tol = evs_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (kmean_max_iter > 0) + kmean_max_it = kmean_max_iter; + else + kmean_max_it = 200; + + if (kmean_tolerance == 0.0f) + kmean_tol = 1.0E-2f; + else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) + kmean_tol = kmean_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + 
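
`nvgraphExtractSubgraphByVertex_impl` and `nvgraphExtractSubgraphByEdge_impl` above populate a second, empty descriptor that inherits the source's topology and value types (operating on the `MultiValuedCsrGraph` when values are present). A hypothetical sketch, assuming a public wrapper with the `_impl` signature.

```cpp
#include <nvgraph.h>

// Hypothetical sketch: subgraph induced by vertices {0, 2, 3}.
static void example_subgraph(nvgraphHandle_t handle, nvgraphGraphDescr_t graph)
{
    nvgraphGraphDescr_t sub;
    nvgraphCreateGraphDescr(handle, &sub);

    int keep[] = {0, 2, 3};
    nvgraphExtractSubgraphByVertex(handle, graph, sub, keep, 3);
}
```
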
+ if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_eig_vects > n_clusters) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (!(evs_type == 0 || evs_type == 1)) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + + if (evs_type == 0) + { + int restartIter_lanczos = 15 + n_eig_vects; + rc = partition(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + else + { + cusolverDnHandle_t cusolverHandle; + cusolverDnCreate(&cusolverHandle); + rc = partition_lobpcg(network, + NULL, // preconditioner + cusolverHandle, + n_clusters, + n_eig_vects, + evs_max_it, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(float)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(float)), + cudaMemcpyDefault)); + } + + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + if (evs_type == 0) + { + int restartIter_lanczos = 15 + n_eig_vects; + rc = partition(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + else + { + cusolverDnHandle_t cusolverHandle; + cusolverDnCreate(&cusolverHandle); + rc = partition_lobpcg(network, + NULL, // preconditioner + cusolverHandle, + n_clusters, + n_eig_vects, + evs_max_it, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(double)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() 
+ * sizeof(double)), + cudaMemcpyDefault)); + } + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * edgeCut, + float * ratioCut) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || edgeCut == NULL || ratioCut == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float edge_cut, ratio_cut; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzePartition(network, + n_clusters, + clust.raw(), + edge_cut, + ratio_cut); + *edgeCut = edge_cut; + *ratioCut = ratio_cut; + break; + } + case CUDA_R_64F: + { + double edge_cut, ratio_cut; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzePartition(network, + n_clusters, + clust.raw(), + edge_cut, + ratio_cut); + *edgeCut = static_cast(edge_cut); + *ratioCut = static_cast(ratio_cut); + break; + } + + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching_impl( nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const nvgraphEdgeWeightMatching_t similarity_metric, + int* aggregates, + size_t* num_aggregates) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (aggregates == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + Matching_t sim_metric; + switch (similarity_metric) + { + case NVGRAPH_UNSCALED: { + sim_metric = USER_PROVIDED; + break; + } + case NVGRAPH_SCALED_BY_ROW_SUM: { + sim_metric = SCALED_BY_ROW_SUM; + break; + } + 
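
`nvgraphBalancedCutClustering_impl` above runs spectral partitioning (Lanczos for evs_type 0, LOBPCG for 1) followed by k-means; zero-valued tolerances and iteration counts fall back to the defaults in the code (1e-3 / 4000 for the eigensolver, 1e-2 / 200 for k-means). A hypothetical sketch on the weighted CSR graph from the earlier examples, assuming a public wrapper with the same parameter list as the `_impl`.

```cpp
#include <nvgraph.h>

// Hypothetical sketch: split the 4-vertex weighted CSR graph into 2 clusters.
static void example_balanced_cut(nvgraphHandle_t handle, nvgraphGraphDescr_t graph)
{
    enum { NV = 4, NCLUST = 2, NEIG = 2 };
    int   clustering[NV];
    float eig_vals[NEIG];
    float eig_vects[NEIG * NV];

    nvgraphBalancedCutClustering(handle, graph, /*weight_index=*/0,
                                 NCLUST, NEIG, /*evs_type=*/0,
                                 /*evs_tolerance=*/0.0f, /*evs_max_iter=*/0,
                                 /*kmean_tolerance=*/0.0f, /*kmean_max_iter=*/0,
                                 clustering, eig_vals, eig_vects);
}
```
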
case NVGRAPH_SCALED_BY_DIAGONAL: { + sim_metric = SCALED_BY_DIAGONAL; + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim()) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector agg(MCSRG->get_num_vertices(), handle->stream); + int num_agg = 0; + nvgraph::Size2Selector one_phase_hand_checking(sim_metric); + rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); + *num_aggregates = static_cast(num_agg); + CHECK_CUDA(cudaMemcpy((int* )aggregates, + agg.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim()) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector agg(MCSRG->get_num_vertices(), handle->stream); + Vector agg_global(MCSRG->get_num_vertices(), handle->stream); + int num_agg = 0; + nvgraph::Size2Selector one_phase_hand_checking(sim_metric); + rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); + *num_aggregates = static_cast(num_agg); + CHECK_CUDA(cudaMemcpy((int* )aggregates, + agg.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization_impl( nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; + + int evs_max_it, kmean_max_it; + int iters_lanczos, iters_kmeans; + float evs_tol, kmean_tol; + + if (evs_max_iter > 0) + evs_max_it = evs_max_iter; + else + evs_max_it = 4000; + + if (evs_tolerance == 0.0f) + evs_tol = 1.0E-3f; + else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) + evs_tol = evs_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (kmean_max_iter > 0) + kmean_max_it = kmean_max_iter; + else + kmean_max_it = 200; + + if (kmean_tolerance == 0.0f) + kmean_tol = 1.0E-2f; + else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) + kmean_tol = kmean_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_eig_vects > n_clusters) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= 
MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + int restartIter_lanczos = 15 + n_eig_vects; + rc = modularity_maximization(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(float)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(float)), + cudaMemcpyDefault)); + } + + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + int restartIter_lanczos = 15 + n_eig_vects; + rc = modularity_maximization(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(double)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(double)), + cudaMemcpyDefault)); + } + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering_impl( nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * modularity) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || modularity == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float mod; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if 
(weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzeModularity(network, + n_clusters, + clust.raw(), + mod); + *modularity = mod; + break; + } + case CUDA_R_64F: + { + double mod; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + rc = analyzeModularity(network, + n_clusters, + clust.raw(), + mod); + *modularity = static_cast(mod); + break; + } + + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } +#ifndef NVGRAPH_LIGHT + nvgraphStatus_t NVGRAPH_API nvgraphContractGraph_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag) //unused, for now + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + typedef int IndexType; + + try { + if (check_context(handle) || + check_graph(descrG) || + !contrdescrG || + check_int_size(numaggregates) || + check_ptr(aggregates)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + contrdescrG->TT = descrG->TT; + contrdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + + Graph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_csr_mul(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_csr_sum(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_csr_min(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_csr_max(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_mv_float_mul(*MCSRG, + aggregates, + numaggregates, + 
handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_mv_float_sum(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_mv_float_min(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_mv_float_max(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_mv_double_mul(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_mv_double_sum(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_mv_double_min(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_mv_double_max(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } +#endif + + nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. 
+ const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter + int* clustering, // (output) clustering + void* eig_vals, // (output) eigenvalues + void* eig_vects) // (output) eigenvectors + { + if (check_ptr(params) || check_ptr(clustering) || check_ptr(eig_vals) || check_ptr(eig_vects)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (params->algorithm == NVGRAPH_MODULARITY_MAXIMIZATION) + return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else if (params->algorithm == NVGRAPH_BALANCED_CUT_LANCZOS) + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + 0, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else if (params->algorithm == NVGRAPH_BALANCED_CUT_LOBPCG) + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + 1, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const int n_clusters, //number of clusters + const int* clustering, // clustering to analyse + nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality + float * score) // (output) clustering score telling how good the clustering is for the selected metric. 
+ { + if (check_ptr(clustering) || check_ptr(score)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (metric == NVGRAPH_MODULARITY) + return nvgraphAnalyzeModularityClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + score); + else if (metric == NVGRAPH_EDGE_CUT) + { + float dummy = 0; + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + score, + &dummy); + } + else if (metric == NVGRAPH_RATIO_CUT) + { + float dummy = 0; + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + &dummy, + score); + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + + nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + uint64_t* result) + { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_ptr(result)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->TT != NVGRAPH_CSR_32 && descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (!(descrG->graphStatus & HAS_TOPOLOGY)) + { + return NVGRAPH_STATUS_INVALID_VALUE; // should have topology + } + + nvgraph::CsrGraph *CSRG = static_cast*>(descrG->graph_handle); + if (CSRG == NULL) + return NVGRAPH_STATUS_MAPPING_ERROR; + nvgraph::triangles_counting::TrianglesCount counter(*CSRG); /* stream, device */ + rc = counter.count(); + uint64_t s_res = counter.get_triangles_count(); + *result = static_cast(s_res); + + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + +} /*namespace nvgraph*/ + +/************************* + * API + *************************/ + +nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value) + { + switch (type) { + case MAJOR_VERSION: + *value = CUDART_VERSION / 1000; + break; + case MINOR_VERSION: + *value = (CUDART_VERSION % 1000) / 10; + break; + case PATCH_LEVEL: + *value = 0; + break; + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle) + { + return nvgraph::nvgraphCreate_impl(handle); +} + +nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti(nvgraphHandle_t *handle, + int numDevices, + int* devices) { + return nvgraph::nvgraphCreateMulti_impl(handle, numDevices, devices); +} + +nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle) + { + return nvgraph::nvgraphDestroy_impl(handle); +} + +nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr(nvgraphHandle_t handle, + nvgraphGraphDescr_t *descrG) + { + return nvgraph::nvgraphCreateGraphDescr_impl(handle, descrG); +} + +nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG) + { + return nvgraph::nvgraphDestroyGraphDescr_impl(handle, descrG); +} + +nvgraphStatus_t NVGRAPH_API nvgraphSetStream(nvgraphHandle_t handle, cudaStream_t stream) + { + return nvgraph::nvgraphSetStream_impl(handle, stream); +} + +nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t topologyType) + { + return nvgraph::nvgraphSetGraphStructure_impl(handle, descrG, topologyData, topologyType); +} +nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* topologyType) + { + 
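+ // Like the remaining public C entry points below, this simply forwards to the corresponding _impl function in the nvgraph namespace.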
return nvgraph::nvgraphGetGraphStructure_impl(handle, descrG, topologyData, topologyType); +} +nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) + { + return nvgraph::nvgraphAllocateVertexData_impl(handle, descrG, numsets, settypes); +} + +nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) + { + return nvgraph::nvgraphAllocateEdgeData_impl(handle, descrG, numsets, settypes); +} + +nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices) + { + return nvgraph::nvgraphExtractSubgraphByVertex_impl(handle, + descrG, + subdescrG, + subvertices, + numvertices); +} + +nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges) + { + return nvgraph::nvgraphExtractSubgraphByEdge_impl(handle, descrG, subdescrG, subedges, numedges); +} + +nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) + { + return nvgraph::nvgraphSetVertexData_impl(handle, descrG, vertexData, setnum); +} + +nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) + { + return nvgraph::nvgraphGetVertexData_impl(handle, descrG, vertexData, setnum); +} + +nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData) { + return nvgraph::nvgraphConvertTopology_impl(handle, + srcTType, + srcTopology, + srcEdgeData, + dataType, + dstTType, + dstTopology, + dstEdgeData); +} + +nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType) { + return nvgraph::nvgraphConvertGraph_impl(handle, srcDescrG, dstDescrG, dstTType); +} + +nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + return nvgraph::nvgraphSetEdgeData_impl(handle, descrG, edgeData, setnum); +} + +nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + return nvgraph::nvgraphGetEdgeData_impl(handle, descrG, edgeData, setnum); +} + +nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t SR) { + return nvgraph::nvgraphSrSpmv_impl_cub(handle, descrG, weight_index, alpha, x, beta, y, SR); +} + +nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp) { + return nvgraph::nvgraphSssp_impl(handle, descrG, weight_index, source_vert, sssp); +} + +//nvgraphTraversal + +typedef enum { + NVGRAPH_TRAVERSAL_DISTANCES_INDEX = 0, + NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX = 1, + NVGRAPH_TRAVERSAL_MASK_INDEX = 2, + 
NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX = 3, + NVGRAPH_TRAVERSAL_ALPHA = 4, + NVGRAPH_TRAVERSAL_BETA = 5 +} nvgraphTraversalParameterIndex_t; + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; + + param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = 0; + param->pad[NVGRAPH_TRAVERSAL_ALPHA] = TRAVERSAL_DEFAULT_ALPHA; + param->pad[NVGRAPH_TRAVERSAL_BETA] = TRAVERSAL_DEFAULT_BETA; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex(nvgraphTraversalParameter_t *param, + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; + + param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = value; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex( const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; + + *value = param.pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX]; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex(nvgraphTraversalParameter_t *param, + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; + + param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = value; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex( const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; + + *value = param.pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX]; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex(nvgraphTraversalParameter_t *param, + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; + + param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = value; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex( const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; + + *value = param.pad[NVGRAPH_TRAVERSAL_MASK_INDEX]; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag(nvgraphTraversalParameter_t *param, + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; + + param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = value; + + return NVGRAPH_STATUS_SUCCESS; + +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag( const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; + + *value = param.pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX]; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha(nvgraphTraversalParameter_t *param, + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; + + param->pad[NVGRAPH_TRAVERSAL_ALPHA] = value; + + return NVGRAPH_STATUS_SUCCESS; + +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; + + *value = param.pad[NVGRAPH_TRAVERSAL_ALPHA]; + + return NVGRAPH_STATUS_SUCCESS; +} + 
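As a point of reference, here is a minimal sketch of how the traversal parameter helpers above are driven from the public C API. The `nvgraphCSRTopology32I_st` field order, `NVGRAPH_TRAVERSAL_BFS`, and the host-buffer handling are assumptions taken from the public `nvgraph.h` header rather than from this diff, and all status checking is omitted.

```c
/* Hypothetical helper: run BFS from vertex 0 on a CSR graph and read back
 * the distances.  Sketch only; every call below should be status-checked. */
#include <stdlib.h>
#include <nvgraph.h>

void bfs_sketch(int n, int nnz, int *row_offsets, int *col_indices)
{
    nvgraphHandle_t handle;
    nvgraphGraphDescr_t graph;
    nvgraphCreate(&handle);
    nvgraphCreateGraphDescr(handle, &graph);

    /* Connectivity in CSR form, 32-bit indices (assumed field order:
     * nvertices, nedges, source_offsets, destination_indices). */
    struct nvgraphCSRTopology32I_st topo = { n, nnz, row_offsets, col_indices };
    nvgraphSetGraphStructure(handle, graph, (void *)&topo, NVGRAPH_CSR_32);

    /* Two int vertex sets: set 0 receives distances, set 1 predecessors. */
    cudaDataType_t vtypes[2] = { CUDA_R_32I, CUDA_R_32I };
    nvgraphAllocateVertexData(handle, graph, 2, vtypes);

    nvgraphTraversalParameter_t params;
    nvgraphTraversalParameterInit(&params);
    nvgraphTraversalSetDistancesIndex(&params, 0);
    nvgraphTraversalSetPredecessorsIndex(&params, 1);
    nvgraphTraversalSetUndirectedFlag(&params, 0);   /* treat edges as directed */

    int source = 0;
    nvgraphTraversal(handle, graph, NVGRAPH_TRAVERSAL_BFS, &source, params);

    /* Pull the distances (vertex set 0) back to the host. */
    int *distances = (int *)malloc((size_t)n * sizeof(int));
    nvgraphGetVertexData(handle, graph, (void *)distances, 0);

    free(distances);
    nvgraphDestroyGraphDescr(handle, graph);
    nvgraphDestroy(handle);
}
```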
+nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta(nvgraphTraversalParameter_t *param, + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; + + param->pad[NVGRAPH_TRAVERSAL_BETA] = value; + + return NVGRAPH_STATUS_SUCCESS; + +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; + + *value = param.pad[NVGRAPH_TRAVERSAL_BETA]; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vert, + const nvgraphTraversalParameter_t params) { + return nvgraph::nvgraphTraversal_impl(handle, descrG, traversalT, source_vert, params); +} + +/** + * CAPI Method for calling 2d BFS algorithm. + * @param handle Nvgraph context handle. + * @param descrG Graph handle (must be 2D partitioned) + * @param source_vert The source vertex ID + * @param distances Pointer to memory allocated to store the distances. + * @param predecessors Pointer to memory allocated to store the predecessors + * @return Status code. + */ +nvgraphStatus_t NVGRAPH_API nvgraph2dBfs(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors) { + return nvgraph::nvgraph2dBfs_impl(handle, descrG, source_vert, distances, predecessors); +} + +//nvgraphWidestPath + +nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path) + { + return nvgraph::nvgraphWidestPath_impl(handle, descrG, weight_index, source_vert, widest_path); +} + +nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const int has_guess, + const size_t pagerank_index, + const float tolerance, + const int max_iter) + { + return nvgraph::nvgraphPagerank_impl(handle, + descrG, + weight_index, + alpha, + bookmark, + has_guess, + pagerank_index, + tolerance, + max_iter); +} + +nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const float tolerance, + const int max_iter, + const int subspace_size, + const int has_guess, + const size_t rank) + { + return nvgraph::nvgraphKrylovPagerank_impl(handle, + descrG, + weight_index, + alpha, + bookmark, + tolerance, + max_iter, + subspace_size, + has_guess, + rank); +} + +nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const int evs_type, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) + { + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + n_clusters, + n_eig_vects, + evs_type, + evs_tolerance, + evs_max_iter, + kmean_tolerance, + kmean_max_iter, + clustering, + eig_vals, + eig_vects); +} + +nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* 
clustering, + float * edgeCut, + float * ratioCut) + { + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + edgeCut, + ratioCut); +} + +nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching( nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const nvgraphEdgeWeightMatching_t similarity_metric, + int* aggregates, + size_t* num_aggregates) + { + return nvgraph::nvgraphHeavyEdgeMatching_impl(handle, + descrG, + weight_index, + similarity_metric, + aggregates, + num_aggregates); +} + +nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) + { + return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, + descrG, + weight_index, + n_clusters, + n_eig_vects, + evs_tolerance, + evs_max_iter, + kmean_tolerance, + kmean_max_iter, + clustering, + eig_vals, + eig_vects); +} + +nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * modularity) + { + return nvgraph::nvgraphAnalyzeModularityClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + modularity); +} +#ifndef NVGRAPH_LIGHT +nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag) + { + return nvgraph::nvgraphContractGraph_impl(handle, + descrG, + contrdescrG, + aggregates, + numaggregates, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp, + flag); +} +#endif + +nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter + int* clustering, // (output) clustering + void* eig_vals, // (output) eigenvalues + void* eig_vects) // (output) eigenvectors + { + return nvgraph::nvgraphSpectralClustering_impl(handle, + descrG, + weight_index, + params, + clustering, + eig_vals, + eig_vects); +} + +nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const int n_clusters, //number of clusters + const int* clustering, // clustering to analyse + nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality + float * score) // (output) clustering score telling how good the clustering is for the selected metric. 
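Similarly, a rough sketch of how the spectral clustering and analysis wrappers above fit together. It assumes a handle and a graph descriptor that already carry CSR topology plus one `CUDA_R_32F` edge-weight set (the preconditions enforced by the `_impl` functions); the helper name and parameter values are illustrative, and status checking is again omitted.

```c
/* Hypothetical helper: partition a weighted CSR graph into 4 clusters and
 * score the result with the modularity metric.  Sketch only. */
#include <stdlib.h>
#include <nvgraph.h>

void spectral_sketch(nvgraphHandle_t handle, nvgraphGraphDescr_t graph, int n)
{
    struct SpectralClusteringParameter params;
    params.n_clusters      = 4;
    params.n_eig_vects     = 4;      /* must not exceed n_clusters */
    params.algorithm       = NVGRAPH_BALANCED_CUT_LANCZOS;  /* or NVGRAPH_MODULARITY_MAXIMIZATION */
    params.evs_tolerance   = 0.0f;   /* 0 selects the library default */
    params.evs_max_iter    = 0;
    params.kmean_tolerance = 0.0f;
    params.kmean_max_iter  = 0;

    /* Result buffers: n labels, n_eig_vects eigenvalues, n_eig_vects * n
     * eigenvector entries; float because edge set 0 is CUDA_R_32F.  Plain
     * host buffers are used since results are copied with cudaMemcpyDefault. */
    int   *clustering = (int *)malloc((size_t)n * sizeof(int));
    float *eig_vals   = (float *)malloc((size_t)params.n_eig_vects * sizeof(float));
    float *eig_vects  = (float *)malloc((size_t)params.n_eig_vects * n * sizeof(float));

    nvgraphSpectralClustering(handle, graph, 0 /* weight_index */, &params,
                              clustering, eig_vals, eig_vects);

    float score = 0.0f;
    nvgraphAnalyzeClustering(handle, graph, 0, params.n_clusters, clustering,
                             NVGRAPH_MODULARITY, &score); /* NVGRAPH_EDGE_CUT and NVGRAPH_RATIO_CUT are also accepted */

    free(eig_vects);
    free(eig_vals);
    free(clustering);
}
```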
+ { + return nvgraph::nvgraphAnalyzeClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + metric, + score); +} + +nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + uint64_t* result) + { + return nvgraph::nvgraphTriangleCount_impl(handle, descrG, result); +} + + +nvgraphStatus_t NVGRAPH_API nvgraphLouvain (cudaDataType_t index_type, cudaDataType_t val_type, const size_t num_vertex, const size_t num_edges, + void* csr_ptr, void* csr_ind, void* csr_val, int weighted, int has_init_cluster, void* init_cluster, + void* final_modularity, void* best_cluster_vec, void* num_level) +{ + NVLOUVAIN_STATUS status = NVLOUVAIN_OK; + if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || + ((init_cluster == NULL) && (has_init_cluster == 1)) || (final_modularity == NULL) || (best_cluster_vec == NULL) || (num_level == NULL)) + return NVGRAPH_STATUS_INVALID_VALUE; + + std::ostream log(0); + bool weighted_b = weighted; + bool has_init_cluster_b = has_init_cluster; + if (val_type == CUDA_R_32F) + status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (float*)csr_val, num_vertex, num_edges, + weighted_b, has_init_cluster_b, (int*)init_cluster, *((float*)final_modularity), + (int*)best_cluster_vec,*((int*)num_level), log); + else + status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (double*)csr_val, num_vertex, num_edges, + weighted_b, has_init_cluster_b, (int*)init_cluster, *((double*)final_modularity), + (int*)best_cluster_vec,*((int*)num_level), log); + + if (status != NVLOUVAIN_OK) + return NVGRAPH_STATUS_INTERNAL_ERROR; + + return NVGRAPH_STATUS_SUCCESS; +} + +nvgraphStatus_t NVGRAPH_API nvgraphJaccard (cudaDataType_t index_type, cudaDataType_t val_type, const size_t n, + const size_t e, void* csr_ptr, void* csr_ind, void* csr_val, int weighted, void* v, void* gamma, void* weight_j) +{ + int status = 0; + + if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || (gamma == NULL) || (weight_j == NULL)) + return NVGRAPH_STATUS_INVALID_VALUE; + + bool weighted_b = weighted; + + if (val_type == CUDA_R_32F) + { + float* weight_i = NULL, *weight_s = NULL, *work = NULL; + NVG_CUDA_TRY(cudaMalloc ((void**)&weight_i, sizeof(float) * e)); + NVG_CUDA_TRY(cudaMalloc ((void**)&weight_s, sizeof(float) * e)); + if (weighted_b == true) + { + NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(float) * n)); + status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (float*) csr_val, (float*) v, work, *((float*) gamma), weight_i, weight_s, (float*)weight_j); + NVG_CUDA_TRY(cudaFree (work)); + } + else + { + NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(float) * n)); + nvlouvain::fill(e, (float*)weight_j, (float)1.0); + status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (float*) csr_val, (float*) v, work, *((float*) gamma), weight_i, weight_s, (float*)weight_j); + NVG_CUDA_TRY(cudaFree (work)); + } + NVG_CUDA_TRY(cudaFree (weight_s)); + NVG_CUDA_TRY(cudaFree (weight_i)); + } + else + { + double* weight_i = NULL, *weight_s = NULL, *work = NULL; + NVG_CUDA_TRY(cudaMalloc ((void**)&weight_i, sizeof(double) * e)); + NVG_CUDA_TRY(cudaMalloc ((void**)&weight_s, sizeof(double) * e)); + if (weighted_b == true) + { + NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(double) * n)); + status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (double*) csr_val, (double*) v, work, *((double*) gamma), weight_i, weight_s, (double*)weight_j); + 
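+ // Free the per-call work buffer here; the weight_i and weight_s scratch arrays are released once both branches rejoin.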
NVG_CUDA_TRY(cudaFree (work)); + } + else + { + NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(double) * n)); + nvlouvain::fill(e, (double*)weight_j, (double)1.0); + status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (double*) csr_val, (double*) v, work, *((double*) gamma), weight_i, weight_s, (double*)weight_j); + NVG_CUDA_TRY(cudaFree (work)); + } + NVG_CUDA_TRY(cudaFree (weight_s)); + NVG_CUDA_TRY(cudaFree (weight_i)); + } + + if (status != 0) + return NVGRAPH_STATUS_INTERNAL_ERROR; + + return NVGRAPH_STATUS_SUCCESS; +} + + +nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) { + return nvgraph::nvgraphAttachGraphStructure_impl( handle, descrG, topologyData, TT); +} + +nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData) { + return nvgraph::nvgraphAttachVertexData_impl( handle, descrG, setnum, settype, vertexData); +} + +nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData) { + return nvgraph::nvgraphAttachEdgeData_impl( handle, descrG, setnum, settype, edgeData); +} + diff --git a/cpp/nvgraph/cpp/src/nvgraph_cublas.cpp b/cpp/nvgraph/cpp/src/nvgraph_cublas.cpp new file mode 100644 index 00000000000..f80f8cea09b --- /dev/null +++ b/cpp/nvgraph/cpp/src/nvgraph_cublas.cpp @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace nvgraph +{ + +cublasHandle_t Cublas::m_handle = 0; + +namespace +{ + cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, + const float* alpha, + const float* x, int incx, + float* y, int incy) + { + return cublasSaxpy(handle, n, alpha, x, incx, y, incy); + } + + cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, + const double* alpha, + const double* x, int incx, + double* y, int incy) + { + return cublasDaxpy(handle, n, alpha, x, incx, y, incy); + } + + cublasStatus_t cublas_copy(cublasHandle_t handle, int n, + const float* x, int incx, + float* y, int incy) + { + return cublasScopy(handle, n, x, incx, y, incy); + } + + cublasStatus_t cublas_copy(cublasHandle_t handle, int n, + const double* x, int incx, + double* y, int incy) + { + return cublasDcopy(handle, n, x, incx, y, incy); + } + + cublasStatus_t cublas_dot(cublasHandle_t handle, int n, + const float* x, int incx, const float* y, int incy, + float* result) + { + return cublasSdot(handle, n, x, incx, y, incy, result); + } + + cublasStatus_t cublas_dot(cublasHandle_t handle, int n, + const double* x, int incx, const double* y, int incy, + double* result) + { + return cublasDdot(handle, n, x, incx, y, incy, result); + } + + + cublasStatus_t cublas_trsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *A, + int lda, + float *x, + int incx) + { + return cublasStrsv (handle, uplo, trans, diag, n, A, lda, x, incx); + } + cublasStatus_t cublas_trsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *A, + int lda, + double *x, + int incx) + { + return cublasDtrsv (handle, uplo, trans, diag, n, A, lda, x, incx); + } + + cublasStatus_t cublas_gemm(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *A, int lda, + const float *B, int ldb, + const float *beta, + float *C, int ldc) + { + return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + + cublasStatus_t cublas_gemm(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const double *alpha, + const double *A, int lda, + const double *B, int ldb, + const double *beta, + double *C, int ldc) + { + return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + } + + cublasStatus_t cublas_gemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const float *alpha, const float *A, int lda, + const float *x, int incx, + const float *beta, float* y, int incy) + { + return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + } + + cublasStatus_t cublas_gemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const double *alpha, const double *A, int lda, + const double *x, int incx, + const double *beta, double* y, int incy) + { + return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + } + + cublasStatus_t cublas_ger(cublasHandle_t handle, int m, int n, + const float* alpha, + const float* x, int incx, + const float* y, int incy, + float* A, int lda) + { + return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); + } + + cublasStatus_t cublas_ger(cublasHandle_t handle, int m, int n, + const double* alpha, + const double* x, int incx, + const double* y, int incy, + double *A, int lda) + { + return cublasDger(handle, m, n, alpha, x, 
incx, y, incy, A, lda); + } + + cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, + const float *x, int incx, float *result) + { + return cublasSnrm2(handle, n, x, incx, result); + } + + cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, + const double *x, int incx, double *result) + { + return cublasDnrm2(handle, n, x, incx, result); + } + + cublasStatus_t cublas_scal(cublasHandle_t handle, int n, + const float* alpha, + float* x, int incx) + { + return cublasSscal(handle, n, alpha, x, incx); + } + + cublasStatus_t cublas_scal(cublasHandle_t handle, int n, + const double* alpha, + double* x, int incx) + { + return cublasDscal(handle, n, alpha, x, incx); + } + + cublasStatus_t cublas_geam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, + const float * alpha, + const float * A, int lda, + const float * beta, + const float * B, int ldb, + float * C, int ldc) + { + return cublasSgeam(handle, transa, transb, m, n, + alpha, A, lda, beta, B, ldb, C, ldc); + } + + cublasStatus_t cublas_geam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, + const double * alpha, + const double * A, int lda, + const double * beta, + const double * B, int ldb, + double * C, int ldc) + { + return cublasDgeam(handle, transa, transb, m, n, + alpha, A, lda, beta, B, ldb, C, ldc); + } + + +} // anonymous namespace. + +void Cublas::set_pointer_mode_device() +{ + cublasHandle_t handle = Cublas::get_handle(); + cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); +} + +void Cublas::set_pointer_mode_host() +{ + cublasHandle_t handle = Cublas::get_handle(); + cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); +} + +template +void Cublas::axpy(int n, T alpha, + const T* x, int incx, + T* y, int incy) +{ + cublasHandle_t handle = Cublas::get_handle(); + CHECK_CUBLAS(cublas_axpy(handle, n, &alpha, x, incx, y, incy)); +} + +template +void Cublas::copy(int n, const T* x, int incx, + T* y, int incy) +{ + cublasHandle_t handle = Cublas::get_handle(); + CHECK_CUBLAS(cublas_copy(handle, n, x, incx, y, incy)); +} + +template +void Cublas::dot(int n, const T* x, int incx, + const T* y, int incy, + T* result) +{ + cublasHandle_t handle = Cublas::get_handle(); + CHECK_CUBLAS(cublas_dot(handle, n, x, incx, y, incy, result)); +} + +template +T Cublas::nrm2(int n, const T* x, int incx) +{ + Cublas::get_handle(); + T result; + Cublas::nrm2(n, x, incx, &result); + return result; +} + +template +void Cublas::nrm2(int n, const T* x, int incx, T* result) +{ + cublasHandle_t handle = Cublas::get_handle(); + CHECK_CUBLAS(cublas_nrm2(handle, n, x, incx, result)); +} + +template +void Cublas::scal(int n, T alpha, T* x, int incx) +{ + Cublas::scal(n, &alpha, x, incx); +} + +template +void Cublas::scal(int n, T* alpha, T* x, int incx) +{ + cublasHandle_t handle = Cublas::get_handle(); + CHECK_CUBLAS(cublas_scal(handle, n, alpha, x, incx)); +} + +template +void Cublas::gemv(bool transposed, int m, int n, + const T* alpha, const T* A, int lda, + const T* x, int incx, + const T* beta, T* y, int incy) +{ + cublasHandle_t handle = Cublas::get_handle(); + cublasOperation_t trans = transposed ? 
CUBLAS_OP_T : CUBLAS_OP_N; + CHECK_CUBLAS(cublas_gemv(handle, trans, m, n, alpha, A, lda, + x, incx, beta, y, incy)); +} + +template +void Cublas::gemv_ext(bool transposed, const int m, const int n, + const T* alpha, const T* A, const int lda, + const T* x, const int incx, + const T* beta, T* y, const int incy, const int offsetx, const int offsety, const int offseta) +{ + cublasHandle_t handle = Cublas::get_handle(); + cublasOperation_t trans = transposed ? CUBLAS_OP_T : CUBLAS_OP_N; + CHECK_CUBLAS(cublas_gemv(handle, trans, m, n, alpha, A+offseta, lda, + x+offsetx, incx, beta, y+offsety, incy)); +} + +template +void Cublas::trsv_v2( cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, + const T *A, int lda, T *x, int incx, int offseta) +{ + cublasHandle_t handle = Cublas::get_handle(); + + CHECK_CUBLAS( cublas_trsv_v2(handle, uplo, trans, diag, n, A+offseta, lda, x, incx)); +} + + +template +void Cublas::ger(int m, int n, const T* alpha, + const T* x, int incx, + const T* y, int incy, + T* A, int lda) +{ + cublasHandle_t handle = Cublas::get_handle(); + CHECK_CUBLAS(cublas_ger(handle, m, n, alpha, x, incx, y, incy, A, lda)); +} + + +template +void Cublas::gemm(bool transa, + bool transb, + int m, int n, int k, + const T * alpha, + const T * A, int lda, + const T * B, int ldb, + const T * beta, + T * C, int ldc) +{ + cublasHandle_t handle = Cublas::get_handle(); + cublasOperation_t cublasTransA = transa ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cublasTransB = transb ? CUBLAS_OP_T : CUBLAS_OP_N; + CHECK_CUBLAS(cublas_gemm(handle, cublasTransA, cublasTransB, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc)); +} + + +template +void Cublas::geam(bool transa, bool transb, int m, int n, + const T * alpha, const T * A, int lda, + const T * beta, const T * B, int ldb, + T * C, int ldc) +{ + cublasHandle_t handle = Cublas::get_handle(); + cublasOperation_t cublasTransA = transa ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cublasTransB = transb ? 
CUBLAS_OP_T : CUBLAS_OP_N; + CHECK_CUBLAS(cublas_geam(handle, cublasTransA, cublasTransB, m, n, + alpha, A, lda, beta, B, ldb, C, ldc)); +} + +template void Cublas::axpy(int n, float alpha, + const float* x, int incx, + float* y, int incy); +template void Cublas::axpy(int n, double alpha, + const double* x, int incx, + double* y, int incy); + +template void Cublas::copy(int n, const float* x, int incx, float* y, int incy); +template void Cublas::copy(int n, const double* x, int incx, double* y, int incy); + +template void Cublas::dot(int n, const float* x, int incx, + const float* y, int incy, + float* result); +template void Cublas::dot(int n, const double* x, int incx, + const double* y, int incy, + double* result); + +template void Cublas::gemv(bool transposed, int m, int n, + const float* alpha, const float* A, int lda, + const float* x, int incx, + const float* beta, float* y, int incy); +template void Cublas::gemv(bool transposed, int m, int n, + const double* alpha, const double* A, int lda, + const double* x, int incx, + const double* beta, double* y, int incy); + +template void Cublas::ger(int m, int n, const float* alpha, + const float* x, int incx, + const float* y, int incy, + float* A, int lda); +template void Cublas::ger(int m, int n, const double* alpha, + const double* x, int incx, + const double* y, int incy, + double* A, int lda); + + +template void Cublas::gemv_ext(bool transposed, const int m, const int n, + const float* alpha, const float* A, const int lda, + const float* x, const int incx, + const float* beta, float* y, const int incy, const int offsetx, const int offsety, const int offseta); +template void Cublas::gemv_ext(bool transposed, const int m, const int n, + const double* alpha, const double* A, const int lda, + const double* x, const int incx, + const double* beta, double* y, const int incy, const int offsetx, const int offsety, const int offseta); + + +template void Cublas::trsv_v2( cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, + const float *A, int lda, float *x, int incx, int offseta); +template void Cublas::trsv_v2( cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, + const double *A, int lda, double *x, int incx, int offseta); + +template double Cublas::nrm2(int n, const double* x, int incx); +template float Cublas::nrm2(int n, const float* x, int incx); + +template void Cublas::scal(int n, float alpha, float* x, int incx); +template void Cublas::scal(int n, double alpha, double* x, int incx); + +template void Cublas::gemm(bool transa, bool transb, + int m, int n, int k, + const float * alpha, + const float * A, int lda, + const float * B, int ldb, + const float * beta, + float * C, int ldc); +template void Cublas::gemm(bool transa, bool transb, + int m, int n, int k, + const double * alpha, + const double * A, int lda, + const double * B, int ldb, + const double * beta, + double * C, int ldc); + +template void Cublas::geam(bool transa, bool transb, int m, int n, + const float * alpha, const float * A, int lda, + const float * beta, const float * B, int ldb, + float * C, int ldc); +template void Cublas::geam(bool transa, bool transb, int m, int n, + const double * alpha, const double * A, int lda, + const double * beta, const double * B, int ldb, + double * C, int ldc); + + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/nvgraph_cusparse.cpp b/cpp/nvgraph/cpp/src/nvgraph_cusparse.cpp new file mode 100644 index 00000000000..a0e637276c2 --- /dev/null +++ 
b/cpp/nvgraph/cpp/src/nvgraph_cusparse.cpp @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace nvgraph +{ +cusparseHandle_t Cusparse::m_handle = 0; + +namespace +{ + cusparseStatus_t cusparse_csrmv( cusparseHandle_t handle, cusparseOperation_t trans, + int m, int n, int nnz, + const float *alpha, + const cusparseMatDescr_t descr, + const float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const float *x, + const float *beta, + float *y) + { + return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); + } + + cusparseStatus_t cusparse_csrmv( cusparseHandle_t handle, cusparseOperation_t trans, + int m, int n, int nnz, + const double *alpha, + const cusparseMatDescr_t descr, + const double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const double *x, + const double *beta, + double *y) + { + return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); + } + + cusparseStatus_t cusparse_csrmm(cusparseHandle_t handle, cusparseOperation_t trans, + int m, int n, int k, int nnz, + const float *alpha, + const cusparseMatDescr_t descr, + const float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const float *x, + const int ldx, + const float *beta, + float *y, + const int ldy) + { + return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + } + + cusparseStatus_t cusparse_csrmm( cusparseHandle_t handle, cusparseOperation_t trans, + int m, int n, int k, int nnz, + const double *alpha, + const cusparseMatDescr_t descr, + const double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const double *x, + const int ldx, + const double *beta, + double *y, + const int ldy) + { + return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + } + +}// end anonymous namespace. + +// Set pointer mode +void Cusparse::set_pointer_mode_device() +{ + cusparseHandle_t handle = Cusparse::get_handle(); + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE); +} +void Cusparse::set_pointer_mode_host() +{ + cusparseHandle_t handle = Cusparse::get_handle(); + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST); +} + +template +void Cusparse::csrmv( const bool transposed, + const bool sym, + const int m, const int n, const int nnz, + const ValueType_* alpha, + const ValueType_* csrVal, + const IndexType_ *csrRowPtr, + const IndexType_ *csrColInd, + const ValueType_* x, + const ValueType_* beta, + ValueType_* y) +{ + cusparseHandle_t handle = Cusparse::get_handle(); + cusparseOperation_t trans = transposed ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseMatDescr_t descr=0; + CHECK_CUSPARSE(cusparseCreateMatDescr(&descr)); // we should move that somewhere else + if (sym) + { + CHECK_CUSPARSE(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + } + else + { + CHECK_CUSPARSE(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL)); + } + CHECK_CUSPARSE(cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO)); + CHECK_CUSPARSE(cusparse_csrmv(handle, trans , m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y)); + CHECK_CUSPARSE(cusparseDestroyMatDescr(descr)); // we should move that somewhere else +} + +template +void Cusparse::csrmv( const bool transposed, + const bool sym, + const ValueType_* alpha, + const ValuedCsrGraph& G, + const Vector& x, + const ValueType_* beta, + Vector& y + ) +{ + cusparseHandle_t handle = Cusparse::get_handle(); + cusparseOperation_t trans = transposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseMatDescr_t descr=0; + CHECK_CUSPARSE(cusparseCreateMatDescr(&descr)); // we should move that somewhere else + if (sym) + { + CHECK_CUSPARSE(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + } + else + { + CHECK_CUSPARSE(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL)); + } + int n = G.get_num_vertices(); + int nnz = G.get_num_edges(); + CHECK_CUSPARSE(cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO)); + CHECK_CUSPARSE(cusparse_csrmv(handle, trans , n, n, nnz, alpha, descr, (ValueType_*)G.get_raw_values(), (IndexType_*)G.get_raw_row_offsets(),(IndexType_*)G.get_raw_column_indices(), (ValueType_*)x.raw(), beta, (ValueType_*)y.raw())); + CHECK_CUSPARSE(cusparseDestroyMatDescr(descr)); // we should move that somewhere else +} + +template void Cusparse::csrmv( const bool transposed, + const bool sym, + const int m, const int n, const int nnz, + const double* alpha, + const double* csrVal, + const int *csrRowPtr, + const int *csrColInd, + const double* x, + const double* beta, + double* y); +template void Cusparse::csrmv( const bool transposed, + const bool sym, + const int m, const int n, const int nnz, + const float* alpha, + const float* csrVal, + const int *csrRowPtr, + const int *csrColInd, + const float* x, + const float* beta, + float* y); +/* +template void Cusparse::csrmv( const bool transposed, + const bool sym, + const double* alpha, + const ValuedCsrGraph& G, + const Vector& x, + const double* beta, + Vector& y + ); + + +template void Cusparse::csrmv( const bool transposed, + const bool sym, + const float* alpha, + const ValuedCsrGraph& G, + const Vector& x, + const float* beta, + Vector& y + ); +*/ + + +template +void Cusparse::csrmm(const bool transposed, + const bool sym, + const int m, + const int n, + const int k, + const int nnz, + const ValueType_* alpha, + const ValueType_* csrVal, + const IndexType_* csrRowPtr, + const IndexType_* csrColInd, + const ValueType_* x, + const int ldx, + const ValueType_* beta, + ValueType_* y, + const int ldy) +{ + + cusparseHandle_t handle = Cusparse::get_handle(); + cusparseOperation_t trans = transposed ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseMatDescr_t descr=0; + CHECK_CUSPARSE(cusparseCreateMatDescr(&descr)); // we should move that somewhere else + if (sym) + { + CHECK_CUSPARSE(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + } + else + { + CHECK_CUSPARSE(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL)); + } + CHECK_CUSPARSE(cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO)); + CHECK_CUSPARSE(cusparse_csrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy)); + CHECK_CUSPARSE(cusparseDestroyMatDescr(descr)); // we should move that somewhere else +} + +template void Cusparse::csrmm(const bool transposed, + const bool sym, + const int m, + const int n, + const int k, + const int nnz, + const double* alpha, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const int ldx, + const double* beta, + double* y, + const int ldy); + +template void Cusparse::csrmm(const bool transposed, + const bool sym, + const int m, + const int n, + const int k, + const int nnz, + const float* alpha, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const int ldx, + const float* beta, + float* y, + const int ldy); + + //template + void Cusparse::csr2coo( const int n, + const int nnz, + const int *csrRowPtr, + int *cooRowInd) + { + cusparseHandle_t handle = Cusparse::get_handle(); + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO ; + CHECK_CUSPARSE(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, idxBase)); + + } + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/nvgraph_error.cu b/cpp/nvgraph/cpp/src/nvgraph_error.cu new file mode 100644 index 00000000000..ff8bd910aa3 --- /dev/null +++ b/cpp/nvgraph/cpp/src/nvgraph_error.cu @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nvgraph_error.hxx" + +namespace nvgraph +{ + + + void nvgraph_default_output(const char *msg, int length) { +#if defined(DEBUG) || defined(VERBOSE_DIAG) + printf("%s", msg); +#endif + } + + NVGRAPH_output_callback nvgraph_output = nvgraph_default_output; + NVGRAPH_output_callback error_output = nvgraph_default_output; + //NVGRAPH_output_callback nvgraph_distributed_output = nvgraph_default_output;*/ + + // Timer + struct cuda_timer::event_pair + { + cudaEvent_t start; + cudaEvent_t end; + }; + cuda_timer::cuda_timer(): p(new event_pair()) { } + + void cuda_timer::start() + { + cudaEventCreate(&p->start); + cudaEventCreate(&p->end); + cudaEventRecord(p->start, 0); + cudaCheckError(); + } + float cuda_timer::stop() + { + cudaEventRecord(p->end, 0); + cudaEventSynchronize(p->end); + float elapsed_time; + cudaEventElapsedTime(&elapsed_time, p->start, p->end); + cudaEventDestroy(p->start); + cudaEventDestroy(p->end); + cudaCheckError(); + return elapsed_time; + } + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/nvgraph_lapack.cu b/cpp/nvgraph/cpp/src/nvgraph_lapack.cu new file mode 100644 index 00000000000..8d167f89306 --- /dev/null +++ b/cpp/nvgraph/cpp/src/nvgraph_lapack.cu @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include +//#include +//#include + +//#define NVGRAPH_USE_LAPACK 1 + +namespace nvgraph +{ + +#define lapackCheckError(status) \ + { \ + if (status < 0) \ + { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " \ + << -status << " had an illegal value."; \ + FatalError(ss.str(), NVGRAPH_ERR_UNKNOWN); \ + } \ + else if (status > 0) \ + FatalError("Lapack error: internal error.", \ + NVGRAPH_ERR_UNKNOWN); \ + } \ + +template +void Lapack::check_lapack_enabled() +{ +#ifndef NVGRAPH_USE_LAPACK + FatalError("Error: LAPACK not enabled.", NVGRAPH_ERR_UNKNOWN); +#endif +} + + +typedef enum{ + CUSOLVER_STATUS_SUCCESS=0, + CUSOLVER_STATUS_NOT_INITIALIZED=1, + CUSOLVER_STATUS_ALLOC_FAILED=2, + CUSOLVER_STATUS_INVALID_VALUE=3, + CUSOLVER_STATUS_ARCH_MISMATCH=4, + CUSOLVER_STATUS_MAPPING_ERROR=5, + CUSOLVER_STATUS_EXECUTION_FAILED=6, + CUSOLVER_STATUS_INTERNAL_ERROR=7, + CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED=8, + CUSOLVER_STATUS_NOT_SUPPORTED = 9, + CUSOLVER_STATUS_ZERO_PIVOT=10, + CUSOLVER_STATUS_INVALID_LICENSE=11 +} cusolverStatus_t; + +typedef enum { + CUBLAS_OP_N=0, + CUBLAS_OP_T=1, + CUBLAS_OP_C=2 +} cublasOperation_t; + +namespace { +// XGEMM +//extern "C" +//void sgemm_(const char *transa, const char *transb, +// const int *m, const int *n, const int *k, +// const float *alpha, const float *a, const int *lda, +// const float *b, const int *ldb, +// const float *beta, float *c, const int *ldc); +//extern "C" +//void dgemm_(const char *transa, const char *transb, +// const int *m, const int *n, const int *k, +// const double *alpha, const double *a, const int *lda, +// const double *b, const int *ldb, +// const double *beta, double *c, const int *ldc); + + + +extern "C" cusolverStatus_t cusolverDnSgemmHost( + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, + float *C, + int ldc); + + +void lapack_gemm(const char transa, const char transb, int m, int n, int k, + float alpha, const float *a, int lda, + const float *b, int ldb, + float beta, float *c, int ldc) +{ + cublasOperation_t cublas_transa = (transa == 'N')? CUBLAS_OP_N : CUBLAS_OP_T ; + cublasOperation_t cublas_transb = (transb == 'N')? CUBLAS_OP_N : CUBLAS_OP_T ; + cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, + &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc); +} + +extern "C" cusolverStatus_t cusolverDnDgemmHost( + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, + double *C, + int ldc); + +void lapack_gemm(const signed char transa, const signed char transb, int m, int n, int k, + double alpha, const double *a, int lda, + const double *b, int ldb, + double beta, double *c, int ldc) +{ + cublasOperation_t cublas_transa = (transa == 'N')? CUBLAS_OP_N : CUBLAS_OP_T ; + cublasOperation_t cublas_transb = (transb == 'N')? 
CUBLAS_OP_N : CUBLAS_OP_T ; + cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, + &alpha, (double*)a, lda, (double*)b, ldb, &beta, c, ldc); +} + +// XSTERF +//extern "C" +//void ssterf_(const int *n, float *d, float *e, int *info); +// +//extern "C" +//void dsterf_(const int *n, double *d, double *e, int *info); +// + +extern "C" cusolverStatus_t cusolverDnSsterfHost( + int n, + float *d, + float *e, + int *info); + +void lapack_sterf(int n, float * d, float * e, int * info) +{ + cusolverDnSsterfHost(n, d, e, info); +} + +extern "C" cusolverStatus_t cusolverDnDsterfHost( + int n, + double *d, + double *e, + int *info); + +void lapack_sterf(int n, double * d, double * e, int * info) +{ + cusolverDnDsterfHost(n, d, e, info); +} + +// XSTEQR +//extern "C" +//void ssteqr_(const char *compz, const int *n, float *d, float *e, +// float *z, const int *ldz, float *work, int * info); +//extern "C" +//void dsteqr_(const char *compz, const int *n, double *d, double *e, +// double *z, const int *ldz, double *work, int *info); + + +extern "C" cusolverStatus_t cusolverDnSsteqrHost( + const signed char *compz, + int n, + float *d, + float *e, + float *z, + int ldz, + float *work, + int *info); + +void lapack_steqr(const signed char compz, int n, float * d, float * e, + float * z, int ldz, float * work, int * info) +{ + cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); +} + +extern "C" cusolverStatus_t cusolverDnDsteqrHost( + const signed char *compz, + int n, + double *d, + double *e, + double *z, + int ldz, + double *work, + int *info); + +void lapack_steqr(const signed char compz, int n, double * d, double * e, + double * z, int ldz, double * work, int * info) +{ + cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); +} + +#ifdef NVGRAPH_USE_LAPACK + + +extern "C" +void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +extern "C" +void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +//extern "C" +//void cgeqrf_(int *m, int *n, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); +//extern "C" +//void zgeqrf_(int *m, int *n, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); + +void lapack_geqrf(int m, int n, float *a, int lda, float *tau, float *work, int *lwork, int *info) +{ + sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); +} +void lapack_geqrf(int m, int n, double *a, int lda, double *tau, double *work, int *lwork, int *info) +{ + dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); +} +//void lapack_geqrf(int m, int n, std::complex *a, int lda, std::complex *tau, std::complex *work, int *lwork, int *info) +//{ +// cgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); +//} +//void lapack_geqrf(int m, int n, std::complex *a, int lda, std::complex *tau, std::complex *work, int *lwork, int *info) +//{ +// zgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); +//} + +extern "C" +void sormqr_ (char* side, char* trans, int *m, int *n, int *k, float *a, int *lda, const float *tau, float* c, int *ldc, float *work, int *lwork, int *info); +extern "C" +void dormqr_(char* side, char* trans, int *m, int *n, int *k, double *a, int *lda, const double *tau, double* c, int *ldc, double *work, int *lwork, int *info); +//extern "C" +//void cunmqr_ (char* side, char* trans, int *m, int *n, int *k, std::complex *a, int *lda, const std::complex *tau, std::complex* c, int *ldc, std::complex *work, int *lwork, int *info); +//extern "C" 
+//void zunmqr_(char* side, char* trans, int *m, int *n, int *k, std::complex *a, int *lda, const std::complex *tau, std::complex* c, int *ldc, std::complex *work, int *lwork, int *info); + +void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, int lda, float *tau, float* c, int ldc, float *work, int *lwork, int *info) +{ + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); +} +void lapack_ormqr(char side, char trans, int m, int n, int k, double *a, int lda, double *tau, double* c, int ldc, double *work, int *lwork, int *info) +{ + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); +} +//void lapack_unmqr(char side, char trans, int m, int n, int k, std::complex *a, int lda, std::complex *tau, std::complex* c, int ldc, std::complex *work, int *lwork, int *info) +//{ +// cunmqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); +//} +//void lapack_unmqr(char side, char trans, int m, int n, int k, std::complex *a, int lda, std::complex *tau, std::complex* c, int ldc, std::complex *work, int *lwork, int *info) +//{ +// zunmqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); +//} + +// extern "C" +// void sorgqr_ ( int* m, int* n, int* k, float* a, int* lda, const float* tau, float* work, int* lwork, int *info ); +// extern "C" +// void dorgqr_ ( int* m, int* n, int* k, double* a, int* lda, const double* tau, double* work, int* lwork, int *info ); +// +// void lapack_orgqr( int m, int n, int k, float* a, int lda, const float* tau, float* work, int *lwork, int *info) +// { +// sorgqr_(&m, &n, &k, a, &lda, tau, work, lwork, info); +// } +// void lapack_orgqr( int m, int n, int k, double* a, int lda, const double* tau, double* work, int* lwork, int *info ) +// { +// dorgqr_(&m, &n, &k, a, &lda, tau, work, lwork, info); +// } + +//int lapack_hseqr_dispatch(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, +// double *h, int* ldh, double *wr, double *wi, double *z, +// int*ldz, double *work, int *lwork, int *info) +//{ +// return dhseqr_(jobvl, jobvr, n, ilo, ihi, h, ldh, wr, wi, z, ldz, work, lwork, info); +//} +// +//int lapack_hseqr_dispatch(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, +// float *h, int* ldh, float *wr, float *wi, float *z, +// int*ldz, float *work, int *lwork, int *info) +//{ +// return shseqr_(jobvl, jobvr, n, ilo, ihi, h, ldh, wr, wi, z, ldz, work, lwork, info); +//} + + +// XGEEV +extern "C" +int dgeev_(char *jobvl, char *jobvr, int *n, double *a, + int *lda, double *wr, double *wi, double *vl, + int *ldvl, double *vr, int *ldvr, double *work, + int *lwork, int *info); + +extern "C" +int sgeev_(char *jobvl, char *jobvr, int *n, float *a, + int *lda, float *wr, float *wi, float *vl, + int *ldvl, float *vr, int *ldvr, float *work, + int *lwork, int *info); + +//extern "C" +//int dhseqr_(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, +// double *h, int* ldh, double *wr, double *wi, double *z, +// int*ldz, double *work, int *lwork, int *info); +//extern "C" +//int shseqr_(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, +// float *h, int* ldh, float *wr, float *wi, float *z, +// int*ldz, float *work, int *lwork, int *info); +// +int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, + int *lda, double *wr, double *wi, double *vl, + int *ldvl, double *vr, int *ldvr, double *work, + int *lwork, int *info) +{ + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); +} + +int lapack_geev_dispatch(char *jobvl, 
char *jobvr, int *n, float *a, + int *lda, float *wr, float *wi, float *vl, + int *ldvl, float *vr, int *ldvr, float *work, + int *lwork, int *info) +{ + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); +} + + + + +// real eigenvalues +template +void lapack_geev(T* A, T* eigenvalues, int dim, int lda) +{ + char job = 'N'; + T* WI = new T[dim]; + int ldv = 1; + T* vl = 0; + int work_size = 6 * dim; + T* work = new T[work_size]; + int info; + lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI, vl, &ldv, + vl, &ldv, work, &work_size, &info); + lapackCheckError(info); + delete [] WI; + delete [] work; +} +//real eigenpairs +template +void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) +{ + char jobvl = 'N'; + char jobvr = 'V'; + T* WI = new T[dim]; + int work_size = 6 * dim; + T* vl = 0; + int ldvl = 1; + T* work = new T[work_size]; + int info; + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI, vl, &ldvl, + eigenvectors, &ldvr, work, &work_size, &info); + lapackCheckError(info); + delete [] WI; + delete [] work; +} +//complex eigenpairs +template +void lapack_geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr) +{ + char jobvl = 'N'; + char jobvr = 'V'; + int work_size = 8 * dim; + int ldvl = 1; + T* work = new T[work_size]; + int info; + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, eigenvalues_i, 0, &ldvl, + eigenvectors_r, &ldvr, work, &work_size, &info); + lapackCheckError(info); + delete [] work; +} + +//template +//void lapack_hseqr(T* Q, T* H, T* eigenvalues, int dim, int ldh, int ldq) +//{ +// char job = 'S'; // S compute eigenvalues and the Schur form T. On entry, the upper Hessenberg matrix H. +// // On exit H contains the upper quasi-triangular matrix T from the Schur decomposition +// char jobvr = 'V'; //Take Q on entry, and the product Q*Z is returned. +// //ILO and IHI are normally set by a previous call to DGEBAL, Otherwise ILO and IHI should be set to 1 and N +// int ilo = 1; +// int ihi = dim; +// T* WI = new T[dim]; +// int ldv = 1; +// T* vl = 0; +// int work_size = 11 * dim; //LWORK as large as 11*N may be required for optimal performance. It is CPU memory and the matrix is assumed to be small +// T* work = new T[work_size]; +// int info; +// lapack_hseqr_dispatch(&job, &jobvr, &dim, &ilo, &ihi, H, &ldh, eigenvalues, WI, Q, &ldq, work, &work_size, &info); +// lapackCheckError(info); +// delete [] WI; +// delete [] work; +//} + +#endif + +} // end anonymous namespace + +template +void Lapack< T >::gemm(bool transa, bool transb, + int m, int n, int k, + T alpha, const T * A, int lda, + const T * B, int ldb, + T beta, T * C, int ldc) +{ +//check_lapack_enabled(); +//#ifdef NVGRAPH_USE_LAPACK + const char transA_char = transa ? 'T' : 'N'; + const char transB_char = transb ? 
'T' : 'N'; + lapack_gemm(transA_char, transB_char, m, n, k, + alpha, A, lda, B, ldb, beta, C, ldc); +//#endif +} + +template +void Lapack< T >::sterf(int n, T * d, T * e) +{ +// check_lapack_enabled(); +//#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_sterf(n, d, e, &info); + lapackCheckError(info); +//#endif +} + +template +void Lapack< T >::steqr(char compz, int n, T * d, T * e, + T * z, int ldz, T * work) +{ +// check_lapack_enabled(); +//#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_steqr(compz, n, d, e, z, ldz, work, &info); + lapackCheckError(info); +//#endif +} + +template +void Lapack< T >::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) +{ + check_lapack_enabled(); + #ifdef NVGRAPH_USE_LAPACK + int info; + lapack_geqrf(m, n, a, lda, tau, work, lwork, &info); + lapackCheckError(info); + #endif +} +template +void Lapack< T >::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork) +{ + check_lapack_enabled(); + #ifdef NVGRAPH_USE_LAPACK + char side = right_side ? 'R' : 'L'; + char trans = transq ? 'T' : 'N'; + int info; + lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); + lapackCheckError(info); + #endif +} + +//template +//void Lapack< T >::unmqr(bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork) +//{ +// check_lapack_enabled(); +// #ifdef NVGRAPH_USE_LAPACK +// char side = right_side ? 'R' : 'L'; +// char trans = transq ? 'T' : 'N'; +// int info; +// lapack_unmqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); +// lapackCheckError(info); +// #endif +//} + +//template +//void Lapack< T >::orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork) +//{ +// check_lapack_enabled(); +// #ifdef NVGRAPH_USE_LAPACK +// int info; +// lapack_orgqr(m, n, k, a, lda, tau, work, lwork, &info); +// lapackCheckError(info); +// #endif +//} +//template +//void Lapack< T >::qrf(int n, int k, T *H, T *C, T *Q, T *R) +//{ +// check_lapack_enabled(); +// #ifdef NVGRAPH_USE_LAPACK +// // int m = n, k = n, lda=n, lwork=2*n, info; +// // lapack_geqrf(m, n, H, lda, C, work, lwork, &info); +// // lapackCheckError(info); +// // lapack_ormqr(m, n, k, H, lda, tau, c, ldc, work, lwork, &info); +// // lapackCheckError(info); +// #endif +//} + +//real eigenvalues +template +void Lapack< T >::geev(T* A, T* eigenvalues, int dim, int lda) +{ + check_lapack_enabled(); +#ifdef NVGRAPH_USE_LAPACK + lapack_geev(A, eigenvalues, dim, lda); +#endif +} +//real eigenpairs +template +void Lapack< T >::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) +{ + check_lapack_enabled(); +#ifdef NVGRAPH_USE_LAPACK + lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); +#endif +} +//complex eigenpairs +template +void Lapack< T >::geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr) +{ + check_lapack_enabled(); +#ifdef NVGRAPH_USE_LAPACK + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); +#endif +} + +//template +//void Lapack< T >::hseqr(T* Q, T* H, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq) +//{ +// check_lapack_enabled(); +//#ifdef NVGRAPH_USE_LAPACK +// lapack_hseqr(Q, H, eigenvalues, dim, ldh, ldq); +//#endif +//} + +// Explicit instantiation +template void Lapack::check_lapack_enabled(); +template void Lapack::gemm(bool transa, bool transb,int m, int n, int k,float alpha, const float * A, 
int lda, const float * B, int ldb, float beta, float * C, int ldc); +template void Lapack::sterf(int n, float * d, float * e); +template void Lapack::geev (float* A, float* eigenvalues, float* eigenvectors, int dim, int lda, int ldvr); +template void Lapack::geev (float* A, float* eigenvalues_r, float* eigenvalues_i, float* eigenvectors_r, float* eigenvectors_i, int dim, int lda, int ldvr); +//template void Lapack::hseqr(float* Q, float* H, float* eigenvalues, float* eigenvectors, int dim, int ldh, int ldq); +template void Lapack::steqr(char compz, int n, float * d, float * e, float * z, int ldz, float * work); +template void Lapack::geqrf(int m, int n, float *a, int lda, float *tau, float *work, int *lwork); +template void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, float *a, int lda, float *tau, float *c, int ldc, float *work, int *lwork); +//template void Lapack::orgqr(int m, int n, int k, float* a, int lda, const float* tau, float* work, int* lwork); + +template void Lapack::check_lapack_enabled(); +template void Lapack::gemm(bool transa, bool transb, int m, int n, int k, double alpha, const double * A, int lda, const double * B, int ldb, double beta, double * C, int ldc); +template void Lapack::sterf(int n, double * d, double * e); +template void Lapack::geev (double* A, double* eigenvalues, double* eigenvectors, int dim, int lda, int ldvr); +template void Lapack::geev (double* A, double* eigenvalues_r, double* eigenvalues_i, double* eigenvectors_r, double* eigenvectors_i, int dim, int lda, int ldvr); +//template void Lapack::hseqr(double* Q, double* H, double* eigenvalues, double* eigenvectors, int dim, int ldh, int ldq); +template void Lapack::steqr(char compz, int n, double * d, double * e, double * z, int ldz, double * work); +template void Lapack::geqrf(int m, int n, double *a, int lda, double *tau, double *work, int *lwork); +template void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, double *a, int lda, double *tau, double *c, int ldc, double *work, int *lwork); +//template void Lapack::orgqr(int m, int n, int k, double* a, int lda, const double* tau, double* work, int* lwork); + +//template void Lapack >::geqrf(int m, int n, std::complex *a, int lda, std::complex *tau, std::complex *work, int *lwork); +//template void Lapack >::geqrf(int m, int n, std::complex *a, int lda, std::complex *tau, std::complex *work, int *lwork); +//template void Lapack >::unmqr(bool right_side, bool transq, int m, int n, int k, std::complex *a, int lda, std::complex *tau, std::complex *c, int ldc, std::complex *work, int *lwork); +//template void Lapack >::unmqr(bool right_side, bool transq, int m, int n, int k, std::complex *a, int lda, std::complex *tau, std::complex *c, int ldc, std::complex *work, int *lwork); + + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/nvgraph_vector_kernels.cu b/cpp/nvgraph/cpp/src/nvgraph_vector_kernels.cu new file mode 100644 index 00000000000..f1c097026ca --- /dev/null +++ b/cpp/nvgraph/cpp/src/nvgraph_vector_kernels.cu @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include "nvgraph_error.hxx" +#include "nvgraph_vector_kernels.hxx" + +#include "debug_macros.h" + +namespace nvgraph +{ + +void check_size(size_t sz) +{ + if (sz>INT_MAX) FatalError("Vector larger than INT_MAX", NVGRAPH_ERR_BAD_PARAMETERS); +} +template +void nrm1_raw_vec (ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream) +{ + thrust::device_ptr dev_ptr(vec); + *res = thrust::reduce(dev_ptr, dev_ptr+n); + cudaCheckError(); +} + +template +void fill_raw_vec (ValueType_* vec, size_t n , ValueType_ value, cudaStream_t stream) +{ + thrust::device_ptr dev_ptr(vec); + thrust::fill(dev_ptr, dev_ptr + n, value); + cudaCheckError(); +} + +template +void dump_raw_vec (ValueType_* vec, size_t n, int offset, cudaStream_t stream) +{ +#ifdef DEBUG + thrust::device_ptr dev_ptr(vec); + COUT().precision(15); + COUT() << "sample size = "<< n << ", offset = "<< offset << std::endl; + thrust::copy(dev_ptr+offset,dev_ptr+offset+n, std::ostream_iterator(COUT(), " ")); + cudaCheckError(); + COUT() << std::endl; +#endif +} + +template +__global__ void flag_zeroes_kernel(int num_vertices, ValueType_* vec, int* flags) +{ + int tidx = blockDim.x * blockIdx.x + threadIdx.x; + for (int r = tidx; r < num_vertices; r += blockDim.x * gridDim.x) + { + if (vec[r] != 0.0) + flags[r] = 1; // NOTE 2 : alpha*0 + (1-alpha)*1 = (1-alpha) + else + flags[r] = 0; + } +} +template + __global__ void dmv0_kernel(const ValueType_ * __restrict__ D, const ValueType_ * __restrict__ x, ValueType_ * __restrict__ y, int n) + { + //y=D*x + int tidx = blockIdx.x*blockDim.x + threadIdx.x ; + for (int i = tidx; i < n; i += blockDim.x * gridDim.x) + y[i] = D[i]*x[i]; +} +template + __global__ void dmv1_kernel(const ValueType_ * __restrict__ D, const ValueType_ * __restrict__ x, ValueType_ * __restrict__ y, int n) + { + // y+=D*x + int tidx = blockIdx.x*blockDim.x + threadIdx.x ; + for (int i = tidx; i < n; i += blockDim.x * gridDim.x) + y[i] += D[i]*x[i]; +} +template +void copy_vec(ValueType_ *vec1, size_t n, ValueType_ *res, cudaStream_t stream) +{ + thrust::device_ptr dev_ptr(vec1); + thrust::device_ptr res_ptr(res); +#ifdef DEBUG + //COUT() << "copy "<< n << " elements" << std::endl; +#endif + thrust::copy_n(dev_ptr, n, res_ptr); + cudaCheckError(); + //dump_raw_vec (res, n, 0); +} + +template +void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flags, cudaStream_t stream) +{ + int items_per_thread = 4; + int num_threads = 128; + int max_grid_size = 4096; + check_size(num_vertices); + int n = static_cast(num_vertices); + int num_blocks = std::min(max_grid_size, (n/(items_per_thread*num_threads))+1); + flag_zeroes_kernel<<>>(num_vertices, vec, flags); + cudaCheckError(); +} + +template +void dmv (size_t num_vertices, ValueType_ alpha, ValueType_* D, ValueType_* x, ValueType_ beta, ValueType_* y, cudaStream_t stream) +{ + int items_per_thread = 4; + int num_threads = 128; + int max_grid_size = 4096; + check_size(num_vertices); + int n = static_cast(num_vertices); + int num_blocks = std::min(max_grid_size, (n/(items_per_thread*num_threads))+1); + if (alpha ==1.0 
&& beta == 0.0) + dmv0_kernel<<>>(D, x, y, n); + else if (alpha ==1.0 && beta == 1.0) + dmv1_kernel<<>>(D, x, y, n); + else + FatalError("Not implemented case of y = D*x", NVGRAPH_ERR_BAD_PARAMETERS); + + cudaCheckError(); +} + +template +void set_connectivity( size_t n, IndexType_ root, ValueType_ self_loop_val, ValueType_ unreachable_val, ValueType_* res, cudaStream_t stream) +{ + fill_raw_vec(res, n, unreachable_val); + cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); + cudaCheckError(); +} + +template void nrm1_raw_vec (float* vec, size_t n, float* res, cudaStream_t stream); +template void nrm1_raw_vec (double* vec, size_t n, double* res, cudaStream_t stream); + +template void dmv (size_t num_vertices, float alpha, float* D, float* x, float beta, float* y, cudaStream_t stream); +template void dmv (size_t num_vertices, double alpha, double* D, double* x, double beta, double* y, cudaStream_t stream); + +template void set_connectivity (size_t n, int root, float self_loop_val, float unreachable_val, float* res, cudaStream_t stream); +template void set_connectivity (size_t n, int root, double self_loop_val, double unreachable_val, double* res, cudaStream_t stream); + +template void flag_zeros_raw_vec (size_t num_vertices, float* vec, int* flags, cudaStream_t stream); +template void flag_zeros_raw_vec (size_t num_vertices, double* vec, int* flags, cudaStream_t stream); + +template void fill_raw_vec (float* vec, size_t n, float value, cudaStream_t stream); +template void fill_raw_vec (double* vec, size_t n, double value, cudaStream_t stream); +template void fill_raw_vec (int* vec, size_t n, int value, cudaStream_t stream); +template void fill_raw_vec (char* vec, size_t n, char value, cudaStream_t stream); + +template void copy_vec(float * vec1, size_t n, float *res, cudaStream_t stream); +template void copy_vec(double * vec1, size_t n, double *res, cudaStream_t stream); +template void copy_vec(int * vec1, size_t n, int *res, cudaStream_t stream); +template void copy_vec(char * vec1, size_t n, char *res, cudaStream_t stream); + +template void dump_raw_vec (float* vec, size_t n, int off, cudaStream_t stream); +template void dump_raw_vec (double* vec, size_t n, int off, cudaStream_t stream); +template void dump_raw_vec (int* vec, size_t n, int off, cudaStream_t stream); +template void dump_raw_vec (char* vec, size_t n, int off, cudaStream_t stream); +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/pagerank.cu b/cpp/nvgraph/cpp/src/pagerank.cu new file mode 100644 index 00000000000..30ecc3165f5 --- /dev/null +++ b/cpp/nvgraph/cpp/src/pagerank.cu @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +//#define NEW_CSRMV + +#include "valued_csr_graph.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_cusparse.hxx" +#include "nvgraph_cublas.hxx" +#include "nvgraph_error.hxx" +#include "pagerank.hxx" +#include "pagerank_kernels.hxx" +#ifdef NEW_CSRMV +#include "csrmv_cub.h" +#include "cub_semiring/cub.cuh" +#endif +#include "nvgraph_csrmv.hxx" +#include +#include + + +#include "debug_macros.h" +#ifdef DEBUG + #define PR_VERBOSE +#endif + +namespace nvgraph +{ +template +Pagerank::Pagerank(const ValuedCsrGraph & network, Vector& dangling_nodes, cudaStream_t stream) + :m_network(network), m_a(dangling_nodes), m_stream(stream) +{ + // initialize cuda libs outside of the solve (this is slow) + Cusparse::get_handle(); + Cublas::get_handle(); + m_residual = 1000.0; + m_damping_factor = 0.0; +} + +template +void Pagerank::setup(ValueType damping_factor, Vector& initial_guess, Vector& pagerank_vector) +{ + int n = static_cast(m_network.get_num_vertices()); +// int nnz = static_cast(m_network.get_num_edges()); +#ifdef DEBUG + if (n != static_cast(initial_guess.get_size()) || n != static_cast(m_a.get_size()) || n != static_cast(pagerank_vector.get_size())) + { + CERR() << "n : " << n << std::endl; + CERR() << "m_network.get_num_edges() " << m_network.get_num_edges() << std::endl; + CERR() << "m_a : " << m_a.get_size() << std::endl; + CERR() << "initial_guess.get_size() : " << initial_guess.get_size() << std::endl; + CERR() << "pagerank_vector.get_size() : " << pagerank_vector.get_size() << std::endl; + FatalError("Wrong input vector in Pagerank solver.", NVGRAPH_ERR_BAD_PARAMETERS); + } +#endif + if (damping_factor > 0.999 || damping_factor < 0.0001) + FatalError("Wrong damping factor value in Pagerank solver.", NVGRAPH_ERR_BAD_PARAMETERS); + m_damping_factor = damping_factor; + m_tmp = initial_guess; + m_pagerank = pagerank_vector; + //dump(m_a.raw(), 100, 0); + update_dangling_nodes(n, m_a.raw(), this->m_damping_factor, m_stream); + //dump(m_a.raw(), 100, 0); + m_b.allocate(n, m_stream); + //m_b.dump(0,n); + ValueType_ val = static_cast( 1.0/n); + + //fill_raw_vec(m_b.raw(), n, val); + // auto b = m_b.raw(); + m_b.fill(val, m_stream); + // WARNING force initialization of the initial guess + //fill(m_tmp.raw(), n, 1.1); +} + +template +bool Pagerank::solve_it() +{ + + int n = static_cast(m_network.get_num_vertices()), nnz = static_cast(m_network.get_num_edges()); + int inc = 1; + ValueType_ dot_res; + + ValueType *a = m_a.raw(), + *b = m_b.raw(), + *pr = m_pagerank.raw(), + *tmp = m_tmp.raw(); + + // normalize the input vector (tmp) + if(m_iterations == 0) + Cublas::scal(n, (ValueType_)1.0/Cublas::nrm2(n, tmp, inc) , tmp, inc); + + //spmv : pr = network * tmp +#ifdef NEW_CSRMV + ValueType_ alpha = cub_semiring::cub::PlusTimesSemiring::times_ident(); // 1. + ValueType_ beta = cub_semiring::cub::PlusTimesSemiring::times_null(); // 0. 
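+  // The SpMV below computes pr = network * tmp; m_network is expected to already
+  // hold the transposed (reverse) graph, see the note at the end of this file.
+  // The code after the SpMV (either branch of the #ifdef) then applies the
+  // PageRank update
+  //   pr = damping * pr + (a . tmp) * b
+  // where b is the uniform vector 1/n and a = damping*a0 + (1-damping)*e,
+  // with a0 the 0/1 dangling-node flags prepared by update_dangling_nodes()
+  // in setup(). pr is then L2-normalized, the residual ||pr - tmp||_2 is
+  // compared against the tolerance, and on convergence pr is rescaled to
+  // unit L1 norm before returning.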
+ SemiringDispatch::template Dispatch< cub_semiring::cub::PlusTimesSemiring >( + m_network.get_raw_values(), + m_network.get_raw_row_offsets(), + m_network.get_raw_column_indices(), + tmp, + pr, + alpha, + beta, + n, + n, + nnz, + m_stream); +#else + ValueType_ alpha = 1.0, beta =0.0; +#if __cplusplus > 199711L + Semiring SR = Semiring::PlusTimes; +#else + Semiring SR = PlusTimes; +#endif + csrmv_mp(n, n, nnz, + alpha, + m_network, + tmp, + beta, + pr, + SR, + m_stream); +#endif + + // Rank one updates + Cublas::scal(n, m_damping_factor, pr, inc); + Cublas::dot(n, a, inc, tmp, inc, &dot_res); + Cublas::axpy(n, dot_res, b, inc, pr, inc); + + // CVG check + // we need to normalize pr to compare it to tmp + // (tmp has been normalized and overwitted at the beginning) + Cublas::scal(n, (ValueType_)1.0/Cublas::nrm2(n, pr, inc) , pr, inc); + + // v = v - x + Cublas::axpy(n, (ValueType_)-1.0, pr, inc, tmp, inc); + m_residual = Cublas::nrm2(n, tmp, inc); + + if (m_residual < m_tolerance) // We know lambda = 1 for Pagerank + { + // CONVERGED + // WARNING Norm L1 is more standard for the output of PageRank + //m_pagerank.dump(0,m_pagerank.get_size()); + Cublas::scal(m_pagerank.get_size(), (ValueType_)1.0/m_pagerank.nrm1(m_stream), pr, inc); + return true; + } + else + { + // m_pagerank.dump(0,m_pagerank.get_size()); + std::swap(m_pagerank, m_tmp); + return false; + } +} + +template +NVGRAPH_ERROR Pagerank::solve(ValueType damping_factor, Vector& initial_guess, Vector& pagerank_vector, float tolerance, int max_it) +{ + + #ifdef PR_VERBOSE + std::stringstream ss; + ss.str(std::string()); + size_t used_mem, free_mem, total_mem; + ss <<" ------------------PageRank------------------"<< std::endl; + ss <<" --------------------------------------------"<< std::endl; + ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; + ss <<" --------------------------------------------"<< std::endl; + COUT()<(tolerance); + setup(damping_factor, initial_guess, pagerank_vector); + bool converged = false; + int i = 0; + + while (!converged && i < m_max_it) + { + m_iterations = i; + converged = solve_it(); + i++; + #ifdef PR_VERBOSE + ss.str(std::string()); + cnmemMemGetInfo(&free_mem, &total_mem, NULL); + used_mem=total_mem-free_mem; + ss << std::setw(10) << i ; + ss.precision(3); + ss << std::setw(20) << std::fixed << used_mem/1024.0/1024.0; + ss << std::setw(15) << std::scientific << m_residual << std::endl; + COUT()<; +template class Pagerank; + +// init : +// We actually need the transpose (=converse =reverse) of the original network, if the inuput is the original network then we have to transopose it +// b is a constant and uniform vector, b = 1.0/num_vertices +// a is a constant vector that initialy store the dangling nodes then we set : a = alpha*a + (1-alpha)e +// pagerank is 0 +// tmp is random +// alpha is a constant scalar (0.85 usually) + +//loop : +// pagerank = csrmv (network, tmp) +// scal(pagerank, alpha); //pagerank = alpha*pagerank +// gamma = dot(a, tmp); //gamma = a*tmp +// pagerank = axpy(b, pagerank, gamma); // pagerank = pagerank+gamma*b + +// convergence check +// tmp = axpby(pagerank, tmp, -1, 1); // tmp = pagerank - tmp +// residual_norm = norm(tmp); +// if converged (residual_norm) + // l1 = l1_norm(pagerank); + // pagerank = scal(pagerank, 1/l1); + // return pagerank +// swap(tmp, pagerank) +//end loop + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/pagerank_kernels.cu b/cpp/nvgraph/cpp/src/pagerank_kernels.cu new 
file mode 100644 index 00000000000..90c90700f5d --- /dev/null +++ b/cpp/nvgraph/cpp/src/pagerank_kernels.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include "nvgraph_error.hxx" +#include "nvgraph_vector_kernels.hxx" +#include "pagerank_kernels.hxx" + +namespace nvgraph +{ + +template +__global__ void update_dn_kernel(int num_vertices, ValueType_* aa, ValueType_ beta) +{ + int tidx = blockDim.x * blockIdx.x + threadIdx.x; + for (int r = tidx; r < num_vertices; r += blockDim.x * gridDim.x) + { + // NOTE 1 : a = alpha*a + (1-alpha)e + if (aa[r] == 0.0) + aa[r] = beta; // NOTE 2 : alpha*0 + (1-alpha)*1 = (1-alpha) + } +} + +template +void update_dangling_nodes(int num_vertices, ValueType_* dangling_nodes, ValueType_ damping_factor, cudaStream_t stream) +{ + + int num_threads = 256; + int max_grid_size = 4096; + int num_blocks = std::min(max_grid_size, (num_vertices/num_threads)+1); + ValueType_ beta = 1.0-damping_factor; + update_dn_kernel<<>>(num_vertices, dangling_nodes,beta); + cudaCheckError(); +} + +//Explicit + +template void update_dangling_nodes (int num_vertices, double* dangling_nodes, double damping_factor, cudaStream_t stream); +template void update_dangling_nodes (int num_vertices, float* dangling_nodes, float damping_factor, cudaStream_t stream); +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/partition.cu b/cpp/nvgraph/cpp/src/partition.cu new file mode 100644 index 00000000000..c1f0dd77425 --- /dev/null +++ b/cpp/nvgraph/cpp/src/partition.cu @@ -0,0 +1,812 @@ +//#ifdef NVGRAPH_PARTITION + +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "partition.hxx" + +#include +#include + +#include +#include +#include +#include +#include + +#include "nvgraph_error.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_cublas.hxx" +#include "matrix.hxx" +#include "lanczos.hxx" +#include "kmeans.hxx" +#include "debug_macros.h" +#include "lobpcg.hxx" +#include "sm_utils.h" + +//#define COLLECT_TIME_STATISTICS 1 +//#undef COLLECT_TIME_STATISTICS + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#endif + +static double timer (void) { +#ifdef COLLECT_TIME_STATISTICS + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +#else + return 0.0; +#endif +} + + +namespace nvgraph { + + // ========================================================= + // Useful macros + // ========================================================= + + // Get index of matrix entry +#define IDX(i,j,lda) ((i)+(j)*(lda)) + +// namespace { +// /// Get string associated with NVGRAPH error flag +// static +// const char* nvgraphGetErrorString(NVGRAPH_ERROR e) { +// switch(e) { +// case NVGRAPH_OK: return "NVGRAPH_OK"; +// case NVGRAPH_ERR_BAD_PARAMETERS: return "NVGRAPH_ERR_BAD_PARAMETERS"; +// case NVGRAPH_ERR_UNKNOWN: return "NVGRAPH_ERR_UNKNOWN"; +// case NVGRAPH_ERR_CUDA_FAILURE: return "NVGRAPH_ERR_CUDA_FAILURE"; +// case NVGRAPH_ERR_THRUST_FAILURE: return "NVGRAPH_ERR_THRUST_FAILURE"; +// case NVGRAPH_ERR_IO: return "NVGRAPH_ERR_IO"; +// case NVGRAPH_ERR_NOT_IMPLEMENTED: return "NVGRAPH_ERR_NOT_IMPLEMENTED"; +// case NVGRAPH_ERR_NO_MEMORY: return "NVGRAPH_ERR_NO_MEMORY"; +// default: return "unknown NVGRAPH error"; +// } +// } +// } + + template + static int print_matrix(IndexType_ m, IndexType_ n, ValueType_ * A, IndexType_ lda, const char *s){ + IndexType_ i,j; + ValueType_ * h_A; + + if (m > lda) { + WARNING("print_matrix - invalid parameter (m > lda)"); + return -1; + } + if (Device_) { + h_A = (ValueType_ *)malloc(lda*n*sizeof(ValueType_)); + if (!h_A) { + WARNING("print_matrix - malloc failed"); + return -1; + } + cudaMemcpy(h_A, A, lda*n*sizeof(ValueType_), cudaMemcpyDeviceToHost); cudaCheckError() + } + else { + h_A = A; + } + + printf("%s\n",s); + if(print_transpose){ + for (j=0; j + static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) { + IndexType_ i,j,k,index,mm; + ValueType_ alpha,v,last; + bool valid; + //ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + //compute alpha + mm =(((m+blockDim.x-1)/blockDim.x)*blockDim.x); //m in multiple of blockDim.x + alpha=0.0; + //printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, li, mn); + for (j=threadIdx.y+blockIdx.y*blockDim.y; j= k) alpha+=v; + } + //shift by last + alpha+=last; + } + } + + //scale by alpha + alpha = utils::shfl(alpha, blockDim.x-1, blockDim.x); + alpha = std::sqrt(alpha); + for (j=threadIdx.y+blockIdx.y*blockDim.y; j + IndexType_ next_pow2(IndexType_ n) { + IndexType_ v; + //Reference: + //http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n-1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v+1; + } + + template + cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { + IndexType_ p2m; + dim3 nthreads, nblocks; + + //find next power of 2 + p2m = next_pow2(m); + //setup launch configuration + nthreads.x = max(2,min(p2m,32)); + nthreads.y = 256/nthreads.x; + nthreads.z = 1; + nblocks.x = 
1; + nblocks.y = (n + nthreads.y - 1)/nthreads.y; + nblocks.z = 1; + //printf("m=%d(%d),n=%d,obs=%p, nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + //launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m,n,obs); + cudaCheckError(); + + return cudaSuccess; + } + + // ========================================================= + // Spectral partitioner + // ========================================================= + + /// Compute spectral graph partition + /** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. + */ + template + NVGRAPH_ERROR partition( ValuedCsrGraph& G, + IndexType_ nParts, + IndexType_ nEigVecs, + IndexType_ maxIter_lanczos, + IndexType_ restartIter_lanczos, + ValueType_ tol_lanczos, + IndexType_ maxIter_kmeans, + ValueType_ tol_kmeans, + IndexType_ * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + IndexType_ & iters_lanczos, + IndexType_ & iters_kmeans) { + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + + if(nParts < 1) { + WARNING("invalid parameter (nParts<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxIter_lanczos < nEigVecs) { + WARNING("invalid parameter (maxIter_lanczos * A; // Adjacency matrix + Matrix * L; // Laplacian matrix + + // Whether to perform full reorthogonalization in Lanczos + bool reorthogonalize_lanczos = false; + + // k-means residual + ValueType_ residual_kmeans; + + bool scale_eigevec_rows=SPECTRAL_USE_SCALING_OF_EIGVECS; //true; //false; + + double t1=0.0,t2=0.0,t_kmeans=0.0; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + A = new CsrMatrix(G); + L = new LaplacianMatrix(*A); + + // Compute smallest eigenvalues and eigenvectors + CHECK_NVGRAPH(computeSmallestEigenvectors(*L, nEigVecs, maxIter_lanczos, + restartIter_lanczos, tol_lanczos, + reorthogonalize_lanczos, iters_lanczos, + eigVals.raw(), eigVecs.raw())); + //eigVals.dump(0, nEigVecs); + //eigVecs.dump(0, nEigVecs); + //eigVecs.dump(n, nEigVecs); + //eigVecs.dump(2*n, nEigVecs); + // Whiten eigenvector matrix + for(i=0; i()); + cudaCheckError(); + std = Cublas::nrm2(n, eigVecs.raw()+IDX(0,i,n), 1)/std::sqrt(static_cast(n)); + thrust::transform(thrust::device_pointer_cast(eigVecs.raw()+IDX(0,i,n)), + 
thrust::device_pointer_cast(eigVecs.raw()+IDX(0,i+1,n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs.raw()+IDX(0,i,n)), + thrust::divides()); + cudaCheckError(); + } + + delete L; + delete A; + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + Vector work(nEigVecs*n, stream); + Cublas::set_pointer_mode_host(); + Cublas::geam(true, false, nEigVecs, n, + &one, eigVecs.raw(), n, + &zero, (ValueType_*) NULL, nEigVecs, + work.raw(), nEigVecs); + CHECK_CUDA(cudaMemcpyAsync(eigVecs.raw(), work.raw(), + nEigVecs*n*sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + } + + // Clean up + + + if (scale_eigevec_rows) { + //WARNING: notice that at this point the matrix has already been transposed, so we are scaling columns + scale_obs(nEigVecs,n,eigVecs.raw()); cudaCheckError() + //print_matrix(nEigVecs-ifirst,n,obs,nEigVecs-ifirst,"Scaled obs"); + //print_matrix(nEigVecs-ifirst,n,obs,nEigVecs-ifirst,"Scaled obs"); + } + + t1=timer(); + + //eigVecs.dump(0, nEigVecs*n); + // Find partition with k-means clustering + CHECK_NVGRAPH(kmeans(n, nEigVecs, nParts, + tol_kmeans, maxIter_kmeans, + eigVecs.raw(), parts, + residual_kmeans, iters_kmeans)); + t2=timer(); + t_kmeans+=t2-t1; +#ifdef COLLECT_TIME_STATISTICS + printf("time k-means %f\n",t_kmeans); +#endif + + + return NVGRAPH_OK; + } + + // ========================================================= + // Spectral partitioner + // ========================================================= + + /// Compute spectral graph partition + /** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. 
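+   * @param M Preconditioner matrix; when preconditioning is enabled it is
+   * handed to the Laplacian via prec_setup() before the eigensolve.
+   * @param cusolverHandle cuSOLVER dense handle used for the potrf/gesvd
+   * workspace queries and passed to the LOBPCG solver.
+   * @note This variant computes the smallest Laplacian eigenpairs with
+   * lobpcg_simplified (optionally preconditioned) instead of the Lanczos
+   * solver used by partition(); maxIter_lanczos and tol_lanczos are
+   * forwarded to LOBPCG as its iteration limit and tolerance.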
+ */ + template + NVGRAPH_ERROR partition_lobpcg( ValuedCsrGraph& G, Matrix * M, cusolverDnHandle_t cusolverHandle, + IndexType_ nParts, + IndexType_ nEigVecs, + IndexType_ maxIter_lanczos, + ValueType_ tol_lanczos, + IndexType_ maxIter_kmeans, + ValueType_ tol_kmeans, + IndexType_ * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + IndexType_ & iters_lanczos, + IndexType_ & iters_kmeans) { + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + + if(nParts < 1) { + WARNING("invalid parameter (nParts<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if(maxIter_lanczos < nEigVecs) { + WARNING("invalid parameter (maxIter_lanczos * A; // Adjacency matrix + Matrix * L; // Laplacian matrix + + // k-means residual + ValueType_ residual_kmeans; + + bool scale_eigevec_rows=SPECTRAL_USE_SCALING_OF_EIGVECS; //true; //false; + + double t1=0.0,t2=0.0,t_kmeans=0.0; + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + A = new CsrMatrix(G); + L = new LaplacianMatrix(*A); + + // LOBPCG use + //bool use_lobpcg=SPECTRAL_USE_LOBPCG; //true; //false; + bool use_preconditioning=SPECTRAL_USE_PRECONDITIONING; //true; //false; + int lwork=0,lwork1=0,lwork2=0,lwork3=0,lwork_potrf=0,lwork_gesvd=0; + double t_setup=0.0,t_solve=0.0; + //ValueType_ * eigVals; + //ValueType_ * work; + ValueType_ * lanczosVecs=0; + //ValueType_ * obs; + + //lanczosVecs are not allocated yet, but should not be touched in *_bufferSize routine + CHECK_CUSOLVER(cusolverXpotrf_bufferSize(cusolverHandle, nEigVecs,lanczosVecs, nEigVecs,&lwork1)); + CHECK_CUSOLVER(cusolverXpotrf_bufferSize(cusolverHandle,2*nEigVecs,lanczosVecs,2*nEigVecs,&lwork2)); + CHECK_CUSOLVER(cusolverXpotrf_bufferSize(cusolverHandle,3*nEigVecs,lanczosVecs,3*nEigVecs,&lwork3)); + lwork_potrf = max(lwork1,max(lwork2,lwork3)); + CHECK_CUSOLVER(cusolverXgesvd_bufferSize(cusolverHandle, nEigVecs, nEigVecs,lanczosVecs,nEigVecs,lanczosVecs,nEigVecs,lanczosVecs,nEigVecs,&lwork1)); + CHECK_CUSOLVER(cusolverXgesvd_bufferSize(cusolverHandle,2*nEigVecs,2*nEigVecs,lanczosVecs,nEigVecs,lanczosVecs,nEigVecs,lanczosVecs,nEigVecs,&lwork2)); + CHECK_CUSOLVER(cusolverXgesvd_bufferSize(cusolverHandle,3*nEigVecs,3*nEigVecs,lanczosVecs,nEigVecs,lanczosVecs,nEigVecs,lanczosVecs,nEigVecs,&lwork3)); + lwork_gesvd = max(lwork1,max(lwork2,lwork3)); + lwork = max(lwork_potrf,lwork_gesvd); + //allocating +2 to hold devInfo for cuSolver, which is of type int, using 2 rather than 1 just in case + //sizeof(ValueType_) < sizeof(IntType_). Notice that this ratio will not be more than 2. 
+ //6*nEigVecs*n - Y=[X,R,P] and Z=[Q,T,V], where X and others are of size nEigVecs x n + //36*nEigVecs*nEigVecs for G, H, HU and HVT, each of max size 3*nEigVecs x 3*nEigVecs + //nEigVecs - nrmR + //lwork - Workspace max Lwork value (for either potrf or gesvd) + //2 - devInfo + cudaMalloc(&lanczosVecs, (9*nEigVecs*n + 36*nEigVecs*nEigVecs + nEigVecs + lwork+2)*sizeof(ValueType_)); + cudaCheckError(); + + //Setup preconditioner M for Laplacian L + t1=timer(); + if (use_preconditioning) { + L->prec_setup(M); + } + t2=timer(); + t_setup+=t2-t1; + + //Run the eigensolver (with preconditioning) + t1=timer(); + if(lobpcg_simplified(Cublas::get_handle(),cusolverHandle, + n, nEigVecs, L, + eigVecs.raw(), eigVals.raw(), + maxIter_lanczos,tol_lanczos, + lanczosVecs, //work array (on device) + iters_lanczos) != 0) + { + WARNING("error in eigensolver"); + return NVGRAPH_ERR_UNKNOWN; + } + + t2=timer(); + t_solve+=t2-t1; + #ifdef COLLECT_TIME_STATISTICS + printf("time eigsolver setup %f\n",t_setup); + printf("time eigsolver solve %f\n",t_solve); + #endif + + delete L; + delete A; + // Transpose eigenvector matrix + // TODO: in-place transpose + { + Vector work(nEigVecs*n, stream); + Cublas::set_pointer_mode_host(); + Cublas::geam(true, false, nEigVecs, n, + &one, eigVecs.raw(), n, + &zero, (ValueType_*) NULL, nEigVecs, + work.raw(), nEigVecs); + CHECK_CUDA(cudaMemcpyAsync(eigVecs.raw(), work.raw(), + nEigVecs*n*sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + } + + if (scale_eigevec_rows) { + //WARNING: notice that at this point the matrix has already been transposed, so we are scaling columns + scale_obs(nEigVecs,n,eigVecs.raw()); cudaCheckError(); + //print_matrix(nEigVecs-ifirst,n,obs,nEigVecs-ifirst,"Scaled obs"); + //print_matrix(nEigVecs-ifirst,n,obs,nEigVecs-ifirst,"Scaled obs"); + } + + t1=timer(); + + //eigVecs.dump(0, nEigVecs*n); + // Find partition with k-means clustering + CHECK_NVGRAPH(kmeans(n, nEigVecs, nParts, + tol_kmeans, maxIter_kmeans, + eigVecs.raw(), parts, + residual_kmeans, iters_kmeans)); + t2=timer(); + t_kmeans+=t2-t1; +#ifdef COLLECT_TIME_STATISTICS + printf("time k-means %f\n",t_kmeans); +#endif + + return NVGRAPH_OK; + } + + // ========================================================= + // Analysis of graph partition + // ========================================================= + + namespace { + /// Functor to generate indicator vectors + /** For use in Thrust transform + */ + template + struct equal_to_i_op { + const IndexType_ i; + public: + equal_to_i_op(IndexType_ _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) + = (thrust::get<0>(t) == i) ? (ValueType_) 1.0 : (ValueType_) 0.0; + } + }; + } + + /// Compute cost function for partition + /** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param parts (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return NVGRAPH error flag. 
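+   * @note For each partition i the cut weight is evaluated as x_i^T L x_i,
+   * where x_i is the 0/1 indicator vector of the partition and L is the
+   * graph Laplacian; because every cut edge is counted from both of its
+   * endpoints when summing over partitions, edgeCut accumulates
+   * x_i^T L x_i / 2.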
+ */ + template + NVGRAPH_ERROR analyzePartition(ValuedCsrGraph & G, + IndexType_ nParts, + const IndexType_ * __restrict__ parts, + ValueType_ & edgeCut, ValueType_ & cost) { + + //using namespace thrust; + + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Loop index + IndexType_ i; + + // Matrix dimension + IndexType_ n = G.get_num_vertices(); + + // Values for computing partition cost + ValueType_ partEdgesCut, partSize; + + // CUDA stream + // TODO: handle non-zero streams + cudaStream_t stream = 0; + + // Device memory + Vector part_i(n, stream); + Vector Lx(n, stream); + + // Adjacency and Laplacian matrices + Matrix * A; + Matrix * L; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Check that parameters are valid + if(nParts < 1) { + WARNING("invalid parameter (nParts<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // Initialize Laplacian + A = new CsrMatrix(G); + L = new LaplacianMatrix(*A); + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for(i=0; i(i)); + cudaCheckError(); + + // Compute size of ith partition + Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); + partSize = round(partSize); + if(partSize < 0.5) { + WARNING("empty partition"); + continue; + } + + // Compute number of edges cut by ith partition + L->mv(1, part_i.raw(), 0, Lx.raw()); + Cublas::dot(n, Lx.raw(), 1, part_i.raw(), 1, &partEdgesCut); + + // Record results + cost += partEdgesCut/partSize; + edgeCut += partEdgesCut/2; + + } + + // Clean up and return + delete L; + delete A; + return NVGRAPH_OK; + + } + + // ========================================================= + // Explicit instantiation + // ========================================================= + template + NVGRAPH_ERROR partition( ValuedCsrGraph & G, + int nParts, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + float tol_lanczos, + int maxIter_kmeans, + float tol_kmeans, + int * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + int & iters_lanczos, + int & iters_kmeans); + template + NVGRAPH_ERROR partition( ValuedCsrGraph & G, + int nParts, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + double tol_lanczos, + int maxIter_kmeans, + double tol_kmeans, + int * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + int & iters_lanczos, + int & iters_kmeans); + + + + template + NVGRAPH_ERROR partition_lobpcg(ValuedCsrGraph & G, + Matrix * M, + cusolverDnHandle_t cusolverHandle, + int nParts, + int nEigVecs, + int maxIter_lanczos, + float tol_lanczos, + int maxIter_kmeans, + float tol_kmeans, + int * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + int & iters_lanczos, + int & iters_kmeans); + + template + NVGRAPH_ERROR partition_lobpcg(ValuedCsrGraph & G, + Matrix * M, + cusolverDnHandle_t cusolverHandle, + int nParts, + int nEigVecs, + int maxIter_lanczos, + double tol_lanczos, + int maxIter_kmeans, + double tol_kmeans, + int * __restrict__ parts, + Vector &eigVals, + Vector &eigVecs, + int & iters_lanczos, + int & iters_kmeans); + template + NVGRAPH_ERROR analyzePartition(ValuedCsrGraph & G, + int nParts, + const int * __restrict__ parts, + float & edgeCut, float & cost); + template + NVGRAPH_ERROR analyzePartition(ValuedCsrGraph & G, + int nParts, + const int * 
__restrict__ parts, + double & edgeCut, double & cost); + +} +//#endif //NVGRAPH_PARTITION + diff --git a/cpp/nvgraph/cpp/src/size2_selector.cu b/cpp/nvgraph/cpp/src/size2_selector.cu new file mode 100644 index 00000000000..4395dfbbf97 --- /dev/null +++ b/cpp/nvgraph/cpp/src/size2_selector.cu @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include //count +#include //sort +#include //lower_bound +#include //unique + +// This should be enabled +#define EXPERIMENTAL_ITERATIVE_MATCHING + +namespace nvgraph { + + +template +void renumberAndCountAggregates(Vector &aggregates, const IndexType n, IndexType& num_aggregates) +{ + // renumber aggregates + Vector scratch(n+1); + scratch.fill(0); + thrust::device_ptr aggregates_thrust_dev_ptr(aggregates.raw()); + thrust::device_ptr scratch_thrust_dev_ptr(scratch.raw()); + + // set scratch[aggregates[i]] = 1 + thrust::fill(thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr), + thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr + n), 1); + + //scratch.dump(0,scratch.get_size()); + + // do prefix sum on scratch + thrust::exclusive_scan(scratch_thrust_dev_ptr, scratch_thrust_dev_ptr+n+1, scratch_thrust_dev_ptr); + // scratch.dump(0,scratch.get_size()); + + // aggregates[i] = scratch[aggregates[i]] + thrust::copy(thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr), + thrust::make_permutation_iterator(scratch_thrust_dev_ptr, aggregates_thrust_dev_ptr + n), + aggregates_thrust_dev_ptr); + cudaCheckError(); + cudaMemcpy(&num_aggregates, &scratch.raw()[scratch.get_size()-1], sizeof(int), cudaMemcpyDefault); //num_aggregates = scratch.raw()[scratch.get_size()-1]; + cudaCheckError(); + +} + +// ------------------ +// Constructors +// ------------------ + +template +Size2Selector::Size2Selector() +{ + //Using default vaues from AmgX + m_deterministic = 1; + m_stream=0; + m_max_iterations = 15; + m_numUnassigned_tol = 0.05; + m_two_phase = 0; + m_aggregation_edge_weight_component= 0; + m_merge_singletons = 1; + m_weight_formula = 0; + m_similarity_metric = SCALED_BY_ROW_SUM; +} + +// ------------------ +// Methods +// ------------------ + +// setAggregates for block_dia_csr_matrix_d format +template +NVGRAPH_ERROR Size2Selector::setAggregates_common_sqblocks(const ValuedCsrGraph &A, Vector &aggregates, int &num_aggregates) +{ + const IndexType n = (int) A.get_num_vertices(); + const IndexType nnz = (int) A.get_num_edges(); + const IndexType *A_row_offsets_ptr = A.get_raw_row_offsets(); + const IndexType *A_column_indices_ptr = A.get_raw_column_indices(); + const ValueType *A_nonzero_values_ptr = A.get_raw_values(); + + // compute row indices + Vector row_indices(nnz); + Cusparse::csr2coo( n, nnz, A_row_offsets_ptr, row_indices.raw()); // note : amgx uses cusp for that + const IndexType *A_row_indices_ptr = row_indices.raw(); + + //All 
vectors should be initialized to -1. + aggregates.fill(-1); + Vector strongest_neighbour(n); + strongest_neighbour.fill(-1); + Vector strongest_neighbour_1phase(n); + strongest_neighbour_1phase.fill(-1); + Vector edge_weights(nnz); + edge_weights.fill(-1); + float *edge_weights_ptr = edge_weights.raw(); + float *rand_edge_weights_ptr = NULL; + cudaCheckError(); + + IndexType *strongest_neighbour_ptr = strongest_neighbour.raw(); + IndexType *strongest_neighbour_1phase_ptr = strongest_neighbour_1phase.raw(); + IndexType *aggregates_ptr = aggregates.raw(); + + const int threads_per_block = 256; + const int max_grid_size = 256; + const int num_blocks = min( max_grid_size, (n-1)/threads_per_block+ 1 ); + const int num_blocks_V2 = min( max_grid_size, (nnz-1)/threads_per_block + 1); + int bsize = 1; // AmgX legacy: we don't use block CSR matrices, this is just to specify that we run on regular matrices + + int numUnassigned = n; + int numUnassigned_previous = numUnassigned; + thrust::device_ptr aggregates_thrust_dev_ptr(aggregates_ptr); + switch(m_similarity_metric) + { + case USER_PROVIDED : + { + //copy non wero values of A in edge_weights (float) + convert_type<<m_stream>>>(nnz, A_nonzero_values_ptr, edge_weights_ptr); + cudaCheckError(); + //edge_weights.dump(0,nnz); + break; + } + case SCALED_BY_ROW_SUM : + { + // Compute the edge weights using .5*(A_ij+A_ji)/max(d(i),d(j)) where d(i) is the sum of outgoing edges of i + Vector row_sum(n); + const ValueType *A_row_sum_ptr = row_sum.raw(); + Vector ones(n); + ones.fill(1.0); + ValueType alpha = 1.0, beta =0.0; + Cusparse::csrmv(false, false, n, n, nnz,&alpha,A_nonzero_values_ptr, A_row_offsets_ptr, A_column_indices_ptr, ones.raw(),&beta, row_sum.raw()); + cudaFuncSetCacheConfig(computeEdgeWeightsBlockDiaCsr_V2,cudaFuncCachePreferL1); + computeEdgeWeights_simple<<m_stream>>>(A_row_offsets_ptr, A_row_indices_ptr, A_column_indices_ptr, A_row_sum_ptr, A_nonzero_values_ptr, nnz, edge_weights_ptr, rand_edge_weights_ptr, n, this->m_weight_formula); + cudaCheckError(); + break; + } + case SCALED_BY_DIAGONAL : + { + // Compute the edge weights using AmgX formula (works only if there is a diagonal entry for each row) + Vector diag_idx(n); + const IndexType *A_dia_idx_ptr = diag_idx.raw(); + + computeDiagonalKernelCSR<<m_stream>>>(n, A.get_raw_row_offsets(), A.get_raw_column_indices(), diag_idx.raw()); + cudaCheckError(); + + cudaFuncSetCacheConfig(computeEdgeWeightsBlockDiaCsr_V2,cudaFuncCachePreferL1); + computeEdgeWeightsBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_row_indices_ptr, A_column_indices_ptr, A_dia_idx_ptr, A_nonzero_values_ptr, nnz, edge_weights_ptr, rand_edge_weights_ptr, n, bsize,this->m_aggregation_edge_weight_component, this->m_weight_formula); + cudaCheckError(); + break; + } + default: return NVGRAPH_ERR_BAD_PARAMETERS; + } + +#ifdef EXPERIMENTAL_ITERATIVE_MATCHING + // TODO (from amgx): allocate host pinned memory + AsyncEvent *throttle_event = new AsyncEvent; + throttle_event->create(); + std::vector h_unagg_vec(1); + Vector d_unagg_vec(1); + + int *unaggregated = &h_unagg_vec[0]; + int *d_unaggregated = d_unagg_vec.raw(); + +#endif + + int icount, s = 1; + { + icount = 0; + float *weights_ptr = edge_weights_ptr; + + do + { + if( !this->m_two_phase ) { + // 1-phase handshaking + findStrongestNeighbourBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, weights_ptr, n, aggregates_ptr, strongest_neighbour_ptr, strongest_neighbour_ptr, bsize, 1, this->m_merge_singletons); + cudaCheckError(); + + } + else { + // 
2-phase handshaking + findStrongestNeighbourBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, weights_ptr, n, aggregates_ptr, strongest_neighbour_1phase_ptr, strongest_neighbour_ptr, bsize, 1, this->m_merge_singletons); + cudaCheckError(); + + // 2nd phase: for each block_row, find the strongest neighbour among those who gave hand on 1st phase + findStrongestNeighbourBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, weights_ptr, n, aggregates_ptr, strongest_neighbour_1phase_ptr, strongest_neighbour_ptr, bsize, 2, this->m_merge_singletons); + cudaCheckError(); + } + + // Look for perfect matches. Also, for nodes without unaggregated neighbours, merge with aggregate containing strongest neighbour + matchEdges<<m_stream>>>(n, aggregates_ptr, strongest_neighbour_ptr); + cudaCheckError(); + +#ifdef EXPERIMENTAL_ITERATIVE_MATCHING + s = (icount & 1); + if( s == 0 ) + { + // count unaggregated vertices + cudaMemsetAsync(d_unaggregated, 0, sizeof(int), this->m_stream); + countAggregates<<m_stream>>>(n, aggregates_ptr, d_unaggregated); + cudaCheckError(); + + cudaMemcpyAsync(unaggregated, d_unaggregated, sizeof(int), cudaMemcpyDeviceToHost, this->m_stream); + throttle_event->record(this->m_stream); + cudaCheckError(); + } + else + { + throttle_event->sync(); + + numUnassigned_previous = numUnassigned; + numUnassigned = *unaggregated; + } +#else + cudaStreamSynchronize(this->m_stream); + numUnassigned_previous = numUnassigned; + numUnassigned = (int)thrust::count(aggregates_thrust_dev_ptr, aggregates_thrust_dev_ptr+n,-1); + cudaCheckError(); +#endif + + icount++; + } while ( (s == 0) || !(numUnassigned==0 || icount > this->m_max_iterations || 1.0*numUnassigned/n < this->m_numUnassigned_tol || numUnassigned == numUnassigned_previous)); + } + + //print + //printf("icount=%i, numUnassiged=%d, numUnassigned_tol=%f\n", icount, numUnassigned, this->m_numUnassigned_tol); + +#ifdef EXPERIMENTAL_ITERATIVE_MATCHING + delete throttle_event; +#endif + + if( this->m_merge_singletons ) + { + // Merge remaining vertices with current aggregates + if (!this->m_deterministic) + { + while (numUnassigned != 0) + { + mergeWithExistingAggregatesBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, edge_weights_ptr, n, aggregates_ptr, bsize,this->m_deterministic,(IndexType*) NULL); + cudaCheckError(); + + numUnassigned = (int)thrust::count(aggregates_thrust_dev_ptr, aggregates_thrust_dev_ptr+n,-1); + cudaCheckError(); + } + + } + else + { + Vector aggregates_candidate(n); + aggregates_candidate.fill(-1); + + while (numUnassigned != 0) + { + mergeWithExistingAggregatesBlockDiaCsr_V2<<m_stream>>>(A_row_offsets_ptr, A_column_indices_ptr, edge_weights_ptr, n, aggregates_ptr, bsize,this->m_deterministic,aggregates_candidate.raw()); + cudaCheckError(); + + joinExistingAggregates<<m_stream>>>(n, aggregates_ptr, aggregates_candidate.raw()); + cudaCheckError(); + + numUnassigned = (int)thrust::count(aggregates_thrust_dev_ptr, aggregates_thrust_dev_ptr+n,-1); + cudaCheckError(); + } + } + } + else + { + //make singletons + aggregateSingletons<<m_stream>>>( aggregates_ptr, n ); + cudaCheckError(); + } + + renumberAndCountAggregates(aggregates, n, num_aggregates); + + return NVGRAPH_OK; +} + +template +NVGRAPH_ERROR Size2Selector::setAggregates(const ValuedCsrGraph &A, Vector &aggregates, int &num_aggregates) +{ + return setAggregates_common_sqblocks( A, aggregates, num_aggregates); +} + +template class Size2Selector; +template class Size2Selector; +template void 
renumberAndCountAggregates (Vector &aggregates, const int n, int& num_aggregates); + +} //nvgraph diff --git a/cpp/nvgraph/cpp/src/sssp.cu b/cpp/nvgraph/cpp/src/sssp.cu new file mode 100644 index 00000000000..47ba109561c --- /dev/null +++ b/cpp/nvgraph/cpp/src/sssp.cu @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define NEW_CSRMV + +#include +#include +#include "valued_csr_graph.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_cusparse.hxx" +#include "nvgraph_cublas.hxx" +#include "nvgraph_error.hxx" +#include "nvgraph_csrmv.hxx" +#include "sssp.hxx" +#ifdef NEW_CSRMV +#include "csrmv_cub.h" +#include "cub_semiring/cub.cuh" +#endif +#include +#include "debug_macros.h" +#ifdef DEBUG + #define SP_VERBOSE 0 +#endif +namespace nvgraph +{ +template +void Sssp::setup(IndexType source_index, Vector& source_connection, Vector& sssp_result) +{ + +#ifdef DEBUG + int n = static_cast(m_network.get_num_vertices()); + if (n != static_cast(source_connection.get_size()) || n != static_cast(sssp_result.get_size()) || !( source_index>=0 && source_index +bool Sssp::solve_it() +{ + int n = static_cast(m_network.get_num_vertices()), nnz = static_cast(m_network.get_num_edges()); + int inc = 1; + ValueType_ tolerance = static_cast( 1.0E-6); + ValueType *sssp = m_sssp.raw(), *tmp = m_tmp.raw(); //initially set y equal to x + // int *mask = m_mask.raw(); + +#ifdef NEW_CSRMV + ValueType_ alpha = cub_semiring::cub::MinPlusSemiring::times_ident(); + ValueType_ beta = cub_semiring::cub::MinPlusSemiring::times_ident(); + SemiringDispatch::template Dispatch< cub_semiring::cub::MinPlusSemiring >( + m_network.get_raw_values(), + m_network.get_raw_row_offsets(), + m_network.get_raw_column_indices(), + tmp, + sssp, + alpha, + beta, + n, + n, + nnz, + m_stream); +#else + ValueType_ alpha = 0.0, beta = 0.0; //times_ident = 0 for MinPlus semiring +#if __cplusplus > 199711L + Semiring SR = Semiring::MinPlus; +#else + Semiring SR = MinPlus; +#endif + // y = Network^T op x op->plus x + // *op* is (plus : min, time : +) + + /*************************** + ---> insert csrmv_mp here + - semiring: (min, +) + - mask: m_mask + - parameters: + (n, n, nnz, + alpha, + m_network, + tmp, + beta, + sssp); + ****************************/ + csrmv_mp(n, n, nnz, + alpha, + m_network, + tmp, + beta, + sssp, + SR, + m_stream); +#endif + // CVG check : ||tmp - sssp|| + Cublas::axpy(n, (ValueType_)-1.0, sssp, inc, tmp, inc); + m_residual = Cublas::nrm2(n, tmp, inc); + if (m_residual < tolerance) + { + return true; + } + else + { + // we do the convergence check by computing the norm two of tmp = sssp(n-1) - sssp(n) + // hence if tmp[i] = 0, sssp[i] hasn't changed so we can skip the i th column at the n+1 iteration + //m_tmp.flag_zeros(m_mask, m_stream); + m_tmp.copy(m_sssp, m_stream); + return false; + } +} +template +NVGRAPH_ERROR Sssp::solve(IndexType source_index, Vector& source_connection, Vector& sssp_result) +{ + setup(source_index, source_connection, 
sssp_result); + bool converged = false; + int max_it = static_cast(m_network.get_num_edges()), i = 0; + + + #ifdef SP_VERBOSE + //int n = static_cast(m_network.get_num_vertices()), nnz = static_cast(m_network.get_num_edges()); + //dump_raw_vec(m_network.get_raw_row_offsets(), n, 0); + //dump_raw_vec(m_network.get_raw_column_indices(),n, 0); + //dump_raw_vec(m_network.get_raw_values(), nnz, 0); + + std::stringstream ss; + ss.str(std::string()); + size_t used_mem, free_mem, total_mem; + ss <<" --------------------Sssp--------------------"<< std::endl; + ss <<" --------------------------------------------"<< std::endl; + ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; + ss <<" --------------------------------------------"<< std::endl; + COUT()<; +template class Sssp; +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/triangles_counting.cpp b/cpp/nvgraph/cpp/src/triangles_counting.cpp new file mode 100644 index 00000000000..b8d40a8af31 --- /dev/null +++ b/cpp/nvgraph/cpp/src/triangles_counting.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +namespace nvgraph +{ + +namespace triangles_counting +{ + +template +TrianglesCount::TrianglesCount(const CsrGraph & graph, cudaStream_t stream, int device_id) +{ + m_stream = stream; + m_done = true; + if (device_id == -1) + cudaGetDevice(&m_dev_id); + else + m_dev_id = device_id; + + cudaGetDeviceProperties(&m_dev_props, m_dev_id); + cudaCheckError(); + cudaSetDevice(m_dev_id); + cudaCheckError(); + + // fill spmat struct; + m_mat.nnz = graph.get_num_edges(); + m_mat.N = graph.get_num_vertices(); + m_mat.roff_d = graph.get_raw_row_offsets(); + m_mat.cols_d = graph.get_raw_column_indices(); + + m_seq.allocate(m_mat.N, stream); + create_nondangling_vector(m_mat.roff_d, m_seq.raw(), &(m_mat.nrows), m_mat.N, m_stream); + m_mat.rows_d = m_seq.raw(); +} + +template +TrianglesCount::~TrianglesCount() +{ + cudaSetDevice(m_dev_id); + cudaCheckError(); +} + +template +void TrianglesCount::tcount_bsh() +{ + //printf("TrianglesCount: %s\n", __func__); fflush(stdout); + + if (m_dev_props.sharedMemPerBlock*8 < (size_t)m_mat.nrows) + { + FatalError("Number of vertices to high to use this kernel!", NVGRAPH_ERR_BAD_PARAMETERS); + } + + unsigned int *bmap_d; + size_t bmld = DIV_UP(m_mat.N,8*sizeof(*bmap_d)); + + bmld = 8ull*DIV_UP(bmld*sizeof(*bmap_d), 8); + bmld /= sizeof(*bmap_d); + + //size_t bmap_sz = sizeof(*bmap_d)*bmld; + int nblock = m_mat.nrows; + + Vector ocnt_d(nblock); + cudaMemset(ocnt_d.raw(), 0, ocnt_d.bytes()); + cudaCheckError(); + + tricnt_bsh(nblock, &m_mat, ocnt_d.raw(), bmld, m_stream); + + m_triangles_number = reduce(ocnt_d.raw(), nblock, m_stream); +} + +template +void TrianglesCount::tcount_b2b() +{ + + //printf("TrianglesCount: %s\n", __func__); fflush(stdout); + + // allocate a big enough array for output + + Vector ocnt_d(m_mat.nrows); + 
cudaMemset(ocnt_d.raw(), 0, ocnt_d.bytes()); + cudaCheckError(); + + // allocate level 1 bitmap + Vector bmapL1_d; + size_t bmldL1 = DIV_UP(m_mat.N,8*sizeof(*bmapL1_d.raw())); + + // make the size a multiple of 8 bytes, for zeroing in kernel... + bmldL1 = 8ull*DIV_UP(bmldL1*sizeof(*bmapL1_d.raw()), 8); + bmldL1 /= sizeof(*bmapL1_d.raw()); + + size_t free_bytes, total_bytes; + cudaMemGetInfo(&free_bytes, &total_bytes); + cudaCheckError(); + + int nblock = (free_bytes*95/100) / (sizeof(*bmapL1_d.raw())*bmldL1);//@TODO: what? + nblock = MIN(nblock, m_mat.nrows); + + size_t bmapL1_sz = sizeof(*bmapL1_d.raw())*bmldL1*nblock; + + bmapL1_d.allocate(bmldL1*nblock); + //cuda 8.0 : memory past 16th GB may not be set with cudaMemset(), + //CHECK_CUDA(cudaMemset(bmapL1_d, 0, bmapL1_sz)); + myCudaMemset((unsigned long long *)bmapL1_d.raw(), 0ull, bmapL1_sz/8, m_stream); + + // allocate level 0 bitmap + Vector bmapL0_d; + size_t bmldL0 = DIV_UP(DIV_UP(m_mat.N, BLK_BWL0), 8*sizeof(*bmapL0_d.raw())); + + bmldL0 = 8ull*DIV_UP(bmldL0*sizeof(*bmapL0_d.raw()), 8); + bmldL0 /= sizeof(*bmapL0_d.raw()); + + size_t bmapL0_sz = sizeof(*bmapL0_d.raw())*nblock*bmldL0; + bmapL0_d.allocate(nblock*bmldL0); + + myCudaMemset((unsigned long long *)bmapL0_d.raw(), 0ull, bmapL0_sz/8, m_stream); + tricnt_b2b(nblock, &m_mat, ocnt_d.raw(), bmapL0_d.raw(), bmldL0, bmapL1_d.raw(), bmldL1, m_stream); + m_triangles_number = reduce(ocnt_d.raw(), nblock, m_stream); +} + +template +void TrianglesCount::tcount_wrp() +{ + //printf("TrianglesCount: %s\n", __func__); fflush(stdout); + + // allocate a big enough array for output + Vector ocnt_d; + size_t ocnt_sz = DIV_UP(m_mat.nrows, (THREADS/32)); + ocnt_d.allocate(ocnt_sz); + + cudaMemset(ocnt_d.raw(), 0, ocnt_d.bytes()); + cudaCheckError(); + + Vector bmap_d; + size_t bmld = DIV_UP(m_mat.N,8*sizeof(*bmap_d.raw())); + + // make the size a multiple of 8 bytes, for zeroing in kernel... 
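// Worked example (illustrative numbers, not from the original source): with
// N = 65 vertices and 32-bit map words, DIV_UP(65, 32) = 3 words = 12 bytes;
// rounding 12 bytes up to the next multiple of 8 gives 16 bytes, i.e. bmld = 4
// words, so the bitmap can be cleared with 64-bit (unsigned long long) stores
// in myCudaMemset() and in the kernels without writing past whole 8-byte units.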
+ bmld = 8ull*DIV_UP(bmld*sizeof(*bmap_d.raw()), 8); + bmld /= sizeof(*bmap_d.raw()); + + // number of blocks limited by birmap size + size_t free_bytes, total_bytes; + cudaMemGetInfo(&free_bytes, &total_bytes); + cudaCheckError(); + + int nblock = (free_bytes*95/100) / (sizeof(*bmap_d.raw())*bmld*(THREADS/32)); + nblock = MIN(nblock, DIV_UP(m_mat.nrows, (THREADS/32))); + //int maxblocks = props.multiProcessorCount * props.maxThreadsPerMultiProcessor / THREADS; + //nblock = MIN(nblock, maxblocks); + + size_t bmap_sz = bmld*nblock*(THREADS/32); + + bmap_d.allocate(bmap_sz); + //CUDA 8.0 memory past 16th GB may not be set with cudaMemset() + //CHECK_CUDA(cudaMemset(bmap_d, 0, bmap_sz)); + myCudaMemset((unsigned long long *)bmap_d.raw(), 0ull, bmap_sz*sizeof(*bmap_d.raw())/8, m_stream); + + tricnt_wrp(nblock, &m_mat, ocnt_d.raw(), bmap_d.raw(), bmld, m_stream); + m_triangles_number = reduce(ocnt_d.raw(), nblock, m_stream); +} + +template +void TrianglesCount::tcount_thr() +{ + //printf("TrianglesCount: %s\n", __func__); fflush(stdout); + + int maxblocks = m_dev_props.multiProcessorCount * m_dev_props.maxThreadsPerMultiProcessor / THREADS; + + int nblock = MIN(maxblocks, DIV_UP(m_mat.nrows,THREADS)); + + Vector ocnt_d(nblock); + + cudaMemset(ocnt_d.raw(), 0, ocnt_d.bytes()); + cudaCheckError(); + + tricnt_thr(nblock, &m_mat, ocnt_d.raw(), m_stream); + m_triangles_number = reduce(ocnt_d.raw(), nblock, m_stream); +} + +template +NVGRAPH_ERROR TrianglesCount::count(TrianglesCountAlgo algo) +{ + switch(algo) + { + case TCOUNT_BSH: + tcount_bsh(); + break; + case TCOUNT_B2B: + tcount_b2b(); + break; + case TCOUNT_WRP: + tcount_wrp(); + break; + case TCOUNT_THR: + tcount_thr(); + break; + case TCOUNT_DEFAULT: + { + double mean_deg = (double)m_mat.nnz / m_mat.nrows; + if (mean_deg < DEG_THR1) tcount_thr(); + else if (mean_deg < DEG_THR2) tcount_wrp(); + else + { + const int shMinBlkXSM = 6; + if (m_dev_props.sharedMemPerBlock*8/shMinBlkXSM < (size_t)m_mat.N) + tcount_b2b(); + else + tcount_bsh(); + } + } + break; + default: + FatalError("Bad algorithm specified for triangles counting", NVGRAPH_ERR_BAD_PARAMETERS); + } + m_event.record(); + return NVGRAPH_OK; +} + +template class TrianglesCount; + +} // end namespace triangle counting + +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu b/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu new file mode 100644 index 00000000000..843e1ff2743 --- /dev/null +++ b/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#ifdef __cplusplus +#define __STDC_LIMIT_MACROS 1 +#define __STDC_FORMAT_MACROS 1 +#endif +#include + +#include + +#include +#include + +#include + +#include "cub/cub.cuh" +#include +#include "sm_utils.h" +using namespace cub; + +#include "cnmem.h" + +#define TH_CENT_K_LOCLEN (34) +#define WP_LEN_TH1 (24) +#define WP_LEN_TH2 (2) + +#if WP_LEN_TH1 > 32 +#error WP_LEN_TH1 must be <= 32! +#endif + +template +__device__ __forceinline__ T LDG(const T* x) +{ +#if __CUDA_ARCH__ < 350 + return *x; +#else + return __ldg(x); +#endif +} + + +namespace nvgraph +{ + +namespace triangles_counting +{ + +// hide behind +void* tmp_get(size_t size, cudaStream_t stream) +{ + void *t = NULL; + cnmemStatus_t status = cnmemMalloc(&t, size, stream); + if( status == CNMEM_STATUS_OUT_OF_MEMORY) + { + FatalError("Not enough memory", NVGRAPH_ERR_NO_MEMORY); + } + else if (status != CNMEM_STATUS_SUCCESS) + { + FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); + } + + return t; +} + +void tmp_release(void* ptr, cudaStream_t stream) +{ + cnmemStatus_t status = cnmemFree(ptr, stream); + if (status != CNMEM_STATUS_SUCCESS) + { + FatalError("Memory manager internal error (release)", NVGRAPH_ERR_UNKNOWN); + } +} + +// cub utility wrappers //////////////////////////////////////////////////////// +template +static inline void cubReduce(InputIteratorT d_in, OutputIteratorT d_out, + int num_items, ReductionOpT reduction_op, + T init, cudaStream_t stream=0, + bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, reduction_op, + init, stream, debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, reduction_op, + init, stream, debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubSum(InputIteratorT d_in, OutputIteratorT d_out, + int num_items, cudaStream_t stream=0, + bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubSortKeys(KeyT *d_keys_in, KeyT *d_keys_out, int num_items, + int begin_bit=0, int end_bit=sizeof(KeyT)*8, + cudaStream_t stream=0, bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, + d_keys_in, d_keys_out, num_items, + begin_bit, end_bit, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, + d_keys_in, d_keys_out, num_items, + begin_bit, end_bit, stream, + debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubSortPairs(KeyT *d_keys_in, KeyT *d_keys_out, + ValueT *d_values_in, ValueT *d_values_out, + int num_items, int begin_bit=0, int end_bit=sizeof(KeyT)*8, + 
cudaStream_t stream=0, bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + d_keys_in, d_keys_out, d_values_in, + d_values_out, num_items, begin_bit, + end_bit, stream, debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + d_keys_in, d_keys_out, d_values_in, + d_values_out, num_items, begin_bit, + end_bit, stream, debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubSortPairsDescending(KeyT *d_keys_in, KeyT *d_keys_out, + ValueT *d_values_in, ValueT *d_values_out, + int num_items, int begin_bit=0, int end_bit=sizeof(KeyT)*8, + cudaStream_t stream=0, bool debug_synchronous=false) { + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + d_keys_in, d_keys_out, d_values_in, + d_values_out, num_items, begin_bit, + end_bit, stream, debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + d_keys_in, d_keys_out, d_values_in, + d_values_out, num_items, begin_bit, + end_bit, stream, debug_synchronous); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubUnique(InputIteratorT d_in, OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, int num_items, + cudaStream_t stream=0, bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, + d_in, d_out, d_num_selected_out, + num_items, stream, debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, + d_in, d_out, d_num_selected_out, + num_items, stream, debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubEncode(InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, + LengthsOutputIteratorT d_counts_out, NumRunsOutputIteratorT d_num_runs_out, + int num_items, cudaStream_t stream=0, bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, + d_in, d_unique_out, d_counts_out, + d_num_runs_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, + d_in, d_unique_out, d_counts_out, + d_num_runs_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubMin(InputIteratorT d_in, OutputIteratorT d_out, + int num_items, cudaStream_t stream=0, + bool debug_synchronous=false) { + + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + 
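// Note: like every cub* wrapper in this file, this follows CUB's two-pass
// convention: the first call, made with d_temp_storage == NULL, only computes
// temp_storage_bytes; the second call performs the actual work, and the
// cnmem-backed scratch buffer is then returned to the pool below.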
tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubMax(InputIteratorT d_in, OutputIteratorT d_out, + int num_items, cudaStream_t stream=0, + bool debug_synchronous=false) { + + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubIf(InputIteratorT d_in, OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, + int num_items, SelectOp select_op, + cudaStream_t stream=0, bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, + d_in, d_out, d_num_selected_out, + num_items, select_op, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, + d_in, d_out, d_num_selected_out, + num_items, select_op, stream, + debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubFlagged(InputIteratorT d_in, FlagIterator d_flags, + OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, + int num_items, cudaStream_t stream=0, + bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, + d_in, d_flags, d_out, d_num_selected_out, + num_items, stream, debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, + d_in, d_flags, d_out, d_num_selected_out, + num_items, stream, debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubExclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, + int num_items, cudaStream_t stream=0, + bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubInclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, + int num_items, cudaStream_t stream=0, + bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, stream, + debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +static inline void cubReduceByKey(KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, 
AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, + int num_items, cudaStream_t stream=0, bool debug_synchronous=false) { + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, + d_keys_in, d_unique_out, + d_values_in, d_aggregates_out, + d_num_runs_out, reduction_op, + num_items, stream, debug_synchronous); + cudaCheckError(); + d_temp_storage = tmp_get(temp_storage_bytes, stream); + cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, + d_keys_in, d_unique_out, + d_values_in, d_aggregates_out, + d_num_runs_out, reduction_op, + num_items, stream, debug_synchronous); + cudaCheckError(); + tmp_release(d_temp_storage, stream); + + return; +} + +template +__device__ __host__ inline bool operator==(const T2 &lhs, const T2 &rhs) { + return (lhs.x == rhs.x && lhs.y == rhs.y); +} + + +////////////////////////////////////////////////////////////////////////////////////////// +template +__device__ T __block_bcast(const T v, const int x) { + + __shared__ T shv; + + __syncthreads(); + if (threadIdx.x == x) shv = v; + __syncthreads(); + + return shv; +} + +template +__device__ __forceinline__ T block_sum(T v) { + + __shared__ T sh[BDIM_X*BDIM_Y/WSIZE]; + + const int lid = threadIdx.x%32; + const int wid = threadIdx.x/32 + ((BDIM_Y > 1) ? threadIdx.y*(BDIM_X/32) : 0); + + #pragma unroll + for(int i = WSIZE/2; i; i >>= 1) { + v += utils::shfl_down(v, i); + } + if (lid == 0) sh[wid] = v; + + __syncthreads(); + if (wid == 0) { + v = (lid < (BDIM_X*BDIM_Y/WSIZE)) ? sh[lid] : 0; + + #pragma unroll + for(int i = (BDIM_X*BDIM_Y/WSIZE)/2; i; i >>= 1) { + v += utils::shfl_down(v, i); + } + } + return v; +} + +////////////////////////////////////////////////////////////////////////////////////////// +template +__global__ void tricnt_b2b_k(const ROW_T ner, + const ROW_T *__restrict__ rows, + const OFF_T *__restrict__ roff, + const ROW_T *__restrict__ cols, + CNT_T *__restrict__ ocnt, + MAP_T *__restrict__ bmapL0, + const size_t bmldL0, + MAP_T *__restrict__ bmapL1, + const size_t bmldL1) { + CNT_T __cnt = 0; + + bmapL1 += bmldL1*blockIdx.x; + bmapL0 += bmldL0*blockIdx.x; + for(ROW_T bid = blockIdx.x; bid < ner; bid += gridDim.x) { + + const OFF_T rbeg = roff[rows[bid]]; + const OFF_T rend = roff[rows[bid]+1]; + + ROW_T firstcol=0; + ROW_T lastcol=0; + + for(OFF_T i = rbeg; i < rend; i += BDIM) { + const ROW_T c = (i+threadIdx.x < rend) ? 
cols[i+threadIdx.x] : -1; + + __syncthreads(); + if (c > -1) { + atomicOr(bmapL1 + c/BITSOF(bmapL1), ((MAP_T)1) << (c%BITSOF(bmapL1))); + atomicOr(bmapL0 + c/BWL0/BITSOF(bmapL0), ((MAP_T)1) << ((c/BWL0)%BITSOF(bmapL0))); + } + __syncthreads(); + + #pragma unroll + for(int j = 0; j < BDIM; j++) { + + const ROW_T curc = __block_bcast(c, j); + if (curc == -1) break; + + lastcol = curc; + if ((i == rbeg) && !j) { + firstcol = curc; + continue; + } + const OFF_T soff = roff[curc]; + const OFF_T eoff = roff[curc+1]; + + for(OFF_T k = eoff-1; k >= soff; k -= BDIM) { + if (k-(int)threadIdx.x < soff) break; + + const ROW_T cc = LDG(cols + k - threadIdx.x); + if (cc < firstcol) break; + + MAP_T mm = ((MAP_T)1) << ((cc/BWL0)%BITSOF(bmapL0)); + if (0 == (bmapL0[cc/BWL0/BITSOF(bmapL0)] & mm)) continue; + + mm = ((MAP_T)1) << (cc%BITSOF(bmapL1)); + if (bmapL1[cc/BITSOF(bmapL1)] & mm) { + __cnt++; + } + } + } + } + + lastcol /= 64; + firstcol /= 64; + + __syncthreads(); + for(int i = rbeg; i < rend; i += BDIM) { + if (i+threadIdx.x < rend) { + ROW_T c = cols[i+threadIdx.x]; + bmapL1[c/BITSOF(bmapL1)] = 0; + bmapL0[c/BWL0/BITSOF(bmapL0)] = 0; + } + } + __syncthreads(); + } + + __cnt = block_sum(__cnt); + if (threadIdx.x == 0) ocnt[blockIdx.x] = __cnt; + + return; +} + +template +void tricnt_b2b(T nblock, spmat_t *m, uint64_t *ocnt_d, unsigned int *bmapL0_d, size_t bmldL0, unsigned int *bmapL1_d, size_t bmldL1, cudaStream_t stream) { + + // still best overall (with no psum) + tricnt_b2b_k<<>>(m->nrows, m->rows_d, + m->roff_d, m->cols_d, ocnt_d, + bmapL0_d, bmldL0, + bmapL1_d, bmldL1); + cudaCheckError(); + return; +} +////////////////////////////////////////////////////////////////////////////////////////// +template +__device__ __forceinline__ T block_sum_sh(T v, T *sh) { + + const int lid = threadIdx.x%32; + const int wid = threadIdx.x/32 + ((BDIM_Y > 1) ? threadIdx.y*(BDIM_X/32) : 0); + + #pragma unroll + for(int i = WSIZE/2; i; i >>= 1) { + v += utils::shfl_down(v, i); + } + if (lid == 0) sh[wid] = v; + + __syncthreads(); + if (wid == 0) { + v = (lid < (BDIM_X*BDIM_Y/WSIZE)) ? sh[lid] : 0; + + #pragma unroll + for(int i = (BDIM_X*BDIM_Y/WSIZE)/2; i; i >>= 1) { + v += utils::shfl_down(v, i); + } + } + return v; +} + +template +__global__ void tricnt_bsh_k(const ROW_T ner, + const ROW_T *__restrict__ rows, + const OFF_T *__restrict__ roff, + const ROW_T *__restrict__ cols, + CNT_T *__restrict__ ocnt, + const size_t bmld) { + CNT_T __cnt = 0; + extern __shared__ unsigned int shm[]; + + for(int i = 0; i < bmld; i += BDIM) { + if (i+threadIdx.x < bmld) { + shm[i+threadIdx.x] = 0; + } + } + + for(ROW_T bid = blockIdx.x; bid < ner; bid += gridDim.x) { + + const OFF_T rbeg = roff[rows[bid]]; + const OFF_T rend = roff[rows[bid]+1]; + + ROW_T firstcol=0; + ROW_T lastcol=0; + + for(OFF_T i = rbeg; i < rend; i += BDIM) { + const ROW_T c = (i+threadIdx.x < rend) ? 
cols[i+threadIdx.x] : -1; + + __syncthreads(); + if (c > -1) atomicOr(shm + c/BITSOF(shm), 1u << (c%BITSOF(shm))); + __syncthreads(); + + #pragma unroll + for(int j = 0; j < BDIM; j++) { + + const ROW_T curc = __block_bcast(c, j); + if (curc == -1) break; + + lastcol = curc; + if ((i == rbeg) && !j) { + firstcol = curc; + continue; + } + + const OFF_T soff = roff[curc]; + const OFF_T eoff = roff[curc+1]; + for(OFF_T k = eoff-1; k >= soff; k -= BDIM) { + if (k-(int)threadIdx.x < soff) break; + + const ROW_T cc = LDG(cols + k - threadIdx.x); + if (cc < firstcol) break; + + const unsigned int mm = 1u << (cc%BITSOF(shm)); + if (shm[cc/BITSOF(shm)] & mm) { + __cnt++; + } + } + } + } + lastcol /= 64; + firstcol /= 64; + + __syncthreads(); + if (lastcol-firstcol < rend-rbeg) { + for(int i = firstcol; i <= lastcol; i += BDIM) { + if (i+threadIdx.x <= lastcol) { + ((unsigned long long *)shm)[i+threadIdx.x] = 0ull; + } + } + } else { + for(int i = rbeg; i < rend; i += BDIM) { + if (i+threadIdx.x < rend) { + shm[cols[i+threadIdx.x]/BITSOF(shm)] = 0; + } + } + } + __syncthreads(); + } + __cnt = block_sum_sh(__cnt, (uint64_t *)shm); + if (threadIdx.x == 0) ocnt[blockIdx.x] = __cnt; + + return; +} + +template +void tricnt_bsh(T nblock, spmat_t *m, uint64_t *ocnt_d, size_t bmld, cudaStream_t stream) { + + tricnt_bsh_k<<>>(m->nrows, m->rows_d, + m->roff_d, m->cols_d, + ocnt_d, bmld); + cudaCheckError(); + return; +} + +//////////////////////////////////////////////////////////////////////////////////////// +template +__global__ void tricnt_wrp_ps_k(const ROW_T ner, + const ROW_T *__restrict__ rows, + const OFF_T *__restrict__ roff, + const ROW_T *__restrict__ cols, + CNT_T *__restrict__ ocnt, + MAP_T *__restrict__ bmap, + const size_t bmld) { + + __shared__ OFF_T sho[NWARP][WSIZE]; + __shared__ ROW_T shs[NWARP][WSIZE]; + __shared__ ROW_T shc[NWARP][WSIZE]; + + CNT_T __cnt = 0; + ROW_T wid = blockIdx.x*blockDim.y + threadIdx.y; + + bmap += bmld*wid; + for(; wid < ner; wid += gridDim.x*blockDim.y) { + + const OFF_T rbeg = roff[rows[wid]]; + const OFF_T rend = roff[rows[wid]+1]; + + //RLEN_THR1 <= 32 + if (rend-rbeg <= RLEN_THR1) { + const int nloc = rend-rbeg; + + OFF_T soff; + OFF_T eoff; + if (threadIdx.x < nloc) { + const ROW_T c = cols[rbeg+threadIdx.x]; + shc[threadIdx.y][threadIdx.x] = c; + soff = roff[c]; + eoff = roff[c+1]; + } + + int mysm=-1; + #pragma unroll + for(int i = 1; i < RLEN_THR1; i++) { + + if (i == nloc) break; + + const OFF_T csoff = utils::shfl(soff, i); + const OFF_T ceoff = utils::shfl(eoff, i); + + if (ceoff-csoff < RLEN_THR2) { + if (threadIdx.x == i) mysm = i; + continue; + } + for(OFF_T k = ceoff-1; k >= csoff; k -= WSIZE) { + if (k-(int)threadIdx.x < csoff) break; + + const ROW_T cc = cols[k-threadIdx.x]; + if (cc < shc[threadIdx.y][0]) break; + for(int j = i-1; j >= 0; j--) { + if (cc == shc[threadIdx.y][j]) {__cnt++;} + } + } + } + if (mysm > -1) { + for(OFF_T k = eoff-1; k >= soff; k--) { + const ROW_T cc = cols[k]; + if (cc < shc[threadIdx.y][0]) break; + for(int j = mysm-1; j >= 0; j--) { + if (cc == shc[threadIdx.y][j]) {__cnt++;} + } + } + } + } else { + ROW_T firstcol=cols[rbeg]; + ROW_T lastcol=cols[rend-1]; + for(OFF_T i = rbeg; i < rend; i += 32) { + + const ROW_T c = (i+threadIdx.x < rend) ? cols[i+threadIdx.x] : -1; + + if (c > -1) atomicOr(bmap + c/BITSOF(bmap), ((MAP_T)1) << (c%BITSOF(bmap))); + + sho[threadIdx.y][threadIdx.x] = (c > -1) ? roff[c] : 0; + + ROW_T len = (c > -1) ? 
roff[c+1]-sho[threadIdx.y][threadIdx.x] : 0; + ROW_T lensum = len; + + #pragma unroll + for(int j = 1; j < 32; j <<= 1) { + lensum += (threadIdx.x >= j)*(utils::shfl_up(lensum, j)); + } + shs[threadIdx.y][threadIdx.x] = lensum-len; + + lensum = utils::shfl(lensum, 31); + + int k=WSIZE-1; + for(int j = lensum-1; j >= 0; j -= WSIZE) { + + if (j < threadIdx.x) break; + + // bisect-right + for(; k >= 0; k--) { + if (shs[threadIdx.y][k] <= j-threadIdx.x) break; + } + + const ROW_T cc = LDG( cols + (sho[threadIdx.y][k] + j-threadIdx.x-shs[threadIdx.y][k]) ); + + if (cc < firstcol) continue; + + const MAP_T mm = ((MAP_T)1) << (cc%BITSOF(bmap)); + if (bmap[cc/BITSOF(bmap)] & mm) { + __cnt++; + } + } + } + lastcol /= 64; + firstcol /= 64; + + if (lastcol-firstcol < rend-rbeg) { + for(int i = firstcol; i <= lastcol; i += WSIZE) { + if (i+threadIdx.x <= lastcol) { + ((unsigned long long *)bmap)[i+threadIdx.x] = 0ull; + } + } + } else { + for(int i = rbeg; i < rend; i += WSIZE) { + if (i+threadIdx.x < rend) { + bmap[cols[i+threadIdx.x]/BITSOF(bmap)] = 0; + } + } + } + } + } + __syncthreads(); + __cnt = block_sum(__cnt); + if (threadIdx.x == 0 && threadIdx.y == 0) { + ocnt[blockIdx.x] = __cnt; + } + return; +} + +template +void tricnt_wrp(T nblock, spmat_t *m, uint64_t *ocnt_d, unsigned int *bmap_d, size_t bmld, cudaStream_t stream) { + + dim3 block(32, THREADS/32); + tricnt_wrp_ps_k<32,THREADS/32, WP_LEN_TH1, WP_LEN_TH2><<>>(m->nrows, m->rows_d, + m->roff_d, m->cols_d, + ocnt_d, bmap_d, bmld); + cudaCheckError(); + return; +} + +////////////////////////////////////////////////////////////////////////////////////////// +template +__global__ void tricnt_thr_k(const ROW_T ner, + const ROW_T *__restrict__ rows, + const OFF_T *__restrict__ roff, + const ROW_T *__restrict__ cols, + CNT_T *__restrict__ ocnt) { + CNT_T __cnt = 0; + const ROW_T tid = blockIdx.x*BDIM + threadIdx.x; + + for(ROW_T rid = tid; rid < ner; rid += gridDim.x*BDIM) { + + const ROW_T r = rows[rid]; + + const OFF_T rbeg = roff[r]; + const OFF_T rend = roff[r+1]; + const ROW_T rlen = rend-rbeg; + + if (!rlen) continue; + if (rlen <= LOCLEN) { + int nloc = 0; + ROW_T loc[LOCLEN]; + + #pragma unroll + for(nloc = 0; nloc < LOCLEN; nloc++) { + if (rbeg+nloc >= rend) break; + loc[nloc] = LDG(cols + rbeg + nloc); + } + + #pragma unroll + for(int i = 1; i < LOCLEN; i++) { + + if (i == nloc) break; + + const ROW_T c = loc[i]; + const OFF_T soff = roff[c]; + const OFF_T eoff = roff[c+1]; + + for(OFF_T k = eoff-1; k >= soff; k--) { + + const ROW_T cc = LDG(cols + k); + if (cc < loc[0]) break; + + for(int j = i-1; j >= 0; j--) { + if (cc == loc[j]) __cnt++; + } + } + } + } else { + const ROW_T minc = cols[rbeg]; + for(int i = 1; i < rlen; i++) { + + const ROW_T c = LDG(cols + rbeg + i); + const OFF_T soff = roff[c]; + const OFF_T eoff = roff[c+1]; + + for(OFF_T k = eoff-1; k >= soff; k--) { + + const ROW_T cc = LDG(cols + k); + if (cc < minc) break; + + for(int j = i-1; j >= 0; j--) { + if (cc == LDG(cols + rbeg + j)) __cnt++; + } + } + } + } + } + + __syncthreads(); + __cnt = block_sum(__cnt); + if (threadIdx.x == 0) ocnt[blockIdx.x] = __cnt; + + return; +} + +template +void tricnt_thr(T nblock, spmat_t *m, uint64_t *ocnt_d, cudaStream_t stream) { + + cudaFuncSetCacheConfig(tricnt_thr_k::LOCINT, typename type_utils::LOCINT, uint64_t>, cudaFuncCachePreferL1); + + tricnt_thr_k<<>>(m->nrows, m->rows_d, + m->roff_d, m->cols_d, + ocnt_d); + cudaCheckError(); + return; +} + +///////////////////////////////////////////////////////////////// +__global__ 
void myset(unsigned long long *p, unsigned long long v, long long n) { + const long long tid = blockIdx.x*blockDim.x + threadIdx.x; + if (tid < n) { + p[tid] = v; + } + return; +} + +void myCudaMemset(unsigned long long *p, unsigned long long v, long long n, cudaStream_t stream) { + if (n <= 0) return; + myset<<>>(p, v, n); + cudaCheckError(); +} + +template +struct NonEmptyRow +{ + const IndexType* p_roff; + __host__ __device__ NonEmptyRow(const IndexType* roff) : p_roff(roff) {} + __host__ __device__ __forceinline__ + bool operator()(const IndexType &a) const + { + return (p_roff[a] < p_roff[a+1]); + } +}; + +template +void create_nondangling_vector(const T* roff, T *p_nonempty, T *n_nonempty, size_t n, cudaStream_t stream) +{ + if (n <= 0) return; + thrust::counting_iterator it(0); + NonEmptyRow temp_func(roff); + T* d_out_num = (T*)tmp_get(sizeof(*n_nonempty), stream); + + cubIf(it, p_nonempty, d_out_num, n, temp_func, stream); + cudaMemcpy(n_nonempty, d_out_num, sizeof(*n_nonempty), cudaMemcpyDeviceToHost); + cudaCheckError(); + tmp_release(d_out_num, stream); + cudaCheckError(); +} + +template +uint64_t reduce(uint64_t *v_d, T n, cudaStream_t stream) { + + uint64_t n_h; + uint64_t *n_d = (uint64_t *)tmp_get(sizeof(*n_d), stream); + + cubSum(v_d, n_d, n, stream); + cudaCheckError(); + cudaMemcpy(&n_h, n_d, sizeof(*n_d), cudaMemcpyDeviceToHost); + cudaCheckError(); + tmp_release(n_d, stream); + + return n_h; +} + + +// instantiate for int +template void tricnt_bsh(int nblock, spmat_t *m, uint64_t *ocnt_d, size_t bmld, cudaStream_t stream); +template void tricnt_wrp(int nblock, spmat_t *m, uint64_t *ocnt_d, unsigned int *bmap_d, size_t bmld, cudaStream_t stream); +template void tricnt_thr(int nblock, spmat_t *m, uint64_t *ocnt_d, cudaStream_t stream); +template void tricnt_b2b(int nblock, spmat_t *m, uint64_t *ocnt_d, unsigned int *bmapL0_d, size_t bmldL0, unsigned int *bmapL1_d, size_t bmldL1, cudaStream_t stream); + +template uint64_t reduce(uint64_t *v_d, int n, cudaStream_t stream); +template void create_nondangling_vector(const int *roff, int *p_nonempty, int *n_nonempty, size_t n, cudaStream_t stream); + +} // end namespace triangle counting + +} // end namespace nvgraph diff --git a/cpp/nvgraph/cpp/src/valued_csr_graph.cpp b/cpp/nvgraph/cpp/src/valued_csr_graph.cpp new file mode 100644 index 00000000000..3882c1607c2 --- /dev/null +++ b/cpp/nvgraph/cpp/src/valued_csr_graph.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "valued_csr_graph.hxx" +#include "cnmem_shared_ptr.hxx" // interface with CuMem (memory pool lib) for shared ptr + +namespace nvgraph +{ + template + ValuedCsrGraph& ValuedCsrGraph::operator=(const ValuedCsrGraph& graph) + { + + } + +} + diff --git a/cpp/nvgraph/cpp/src/widest_path.cu b/cpp/nvgraph/cpp/src/widest_path.cu new file mode 100644 index 00000000000..4da42856574 --- /dev/null +++ b/cpp/nvgraph/cpp/src/widest_path.cu @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define NEW_CSRMV + +#include +#include +#include +#include "nvgraph_error.hxx" +#include "valued_csr_graph.hxx" +#include "nvgraph_vector.hxx" +#include "nvgraph_cublas.hxx" +#ifdef NEW_CSRMV +#include "csrmv_cub.h" +#include "cub_semiring/cub.cuh" +#endif +#include "nvgraph_csrmv.hxx" +#include "widest_path.hxx" + +#include "debug_macros.h" +#ifdef DEBUG +#define MF_VERBOSE 0 +#endif +namespace nvgraph +{ +template +void WidestPath::setup(IndexType source_index, Vector& source_connection, Vector& widest_path_result) +{ + +#ifdef DEBUG + int n = static_cast(m_network.get_num_vertices()); + if (n != static_cast(source_connection.get_size()) || n != static_cast(widest_path_result.get_size()) || !( source_index>=0 && source_index +bool WidestPath::solve_it() +{ + int n = static_cast(m_network.get_num_vertices()), nnz = static_cast(m_network.get_num_edges()); + int inc = 1; + ValueType_ tolerance = static_cast( 1.0E-6); + ValueType *widest_path = m_widest_path.raw(), *tmp = m_tmp.raw(); + // int *mask = m_mask.raw(); + // y = Network^T op x op->plus x + // *op* is (plus : max, time : min) + + /*************************** + ---> insert csrmv_mp here + - semiring: (max, min) + - mask: m_mask // not implemented in csrmv + - parameters: + (n, n, nnz, + alpha, + m_network, + tmp, + beta, + widest_path); + ****************************/ + + // About setting alpha & beta + // 1. The general Csrmv_mp_sr does : + // y = alpha op->time A op->time x op->plus beta op->time y + // 2. SR = MaxMin has : + // plus_ident = SR_type(-inf); + // times_ident = SR_type(inf); + // times_null = SR_type(-inf); + // 3. 
In order to solve : + // y = Network^T op x op->plus x + // We need alpha = times_ident + // beta = times_ident + + +#ifdef NEW_CSRMV + ValueType_ alpha = cub_semiring::cub::MaxMinSemiring::times_ident(); + ValueType_ beta = cub_semiring::cub::MaxMinSemiring::times_ident(); + SemiringDispatch::template Dispatch< cub_semiring::cub::MaxMinSemiring >( + m_network.get_raw_values(), + m_network.get_raw_row_offsets(), + m_network.get_raw_column_indices(), + tmp, + widest_path, + alpha, + beta, + n, + n, + nnz, + m_stream); +#else + + ValueType_ inf; + if (typeid(ValueType_) == typeid(float)) + inf = FLT_MAX ; + else if (typeid(ValueType_) == typeid(double)) + inf = DBL_MAX ; + else + FatalError("Graph value type is not supported by this semiring.", NVGRAPH_ERR_BAD_PARAMETERS); + + ValueType_ alpha = inf, beta = inf; +#if __cplusplus > 199711L + Semiring SR = Semiring::MaxMin; +#else // new csrmv + Semiring SR = MaxMin; +#endif + + csrmv_mp(n, n, nnz, + alpha, + m_network, + tmp, + beta, + widest_path, + SR, + m_stream); +#endif // new csrmv + // CVG check : ||tmp - widest_path|| + Cublas::axpy(n, (ValueType_)-1.0, widest_path, inc, tmp, inc); + m_residual = Cublas::nrm2(n, tmp, inc); + if (m_residual < tolerance) + { + return true; + } + else + { + // we do the convergence check by computing the norm two of tmp = widest_path(n-1) - widest_path(n) + // hence if tmp[i] = 0, widest_path[i] hasn't changed so we can skip the i th column at the n+1 iteration + // m_tmp.flag_zeros(m_mask); + m_tmp.copy(m_widest_path); // we want x+1 = Ax +x and csrmv does y = Ax+y, so we copy x in y here. + return false; + } +} +template +NVGRAPH_ERROR WidestPath::solve(IndexType source_index, Vector& source_connection, Vector& widest_path_result) +{ + setup(source_index, source_connection, widest_path_result); + bool converged = false; + int max_it = 100000, i = 0; + #ifdef MF_VERBOSE + std::stringstream ss; + ss.str(std::string()); + size_t used_mem, free_mem, total_mem; + ss <<" ------------------WidestPath------------------"<< std::endl; + ss <<" --------------------------------------------"<< std::endl; + ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; + ss <<" --------------------------------------------"<< std::endl; + COUT()<; +template class WidestPath; +} // end namespace nvgraph + diff --git a/cpp/nvgraph/cpp/tests/2d_partitioning_test.cpp b/cpp/nvgraph/cpp/tests/2d_partitioning_test.cpp new file mode 100644 index 00000000000..20fb4ee86d3 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/2d_partitioning_test.cpp @@ -0,0 +1,53 @@ +#include "gtest/gtest.h" +#include "nvgraph.h" +#include + +TEST(SimpleBFS2D, DummyTest) { + nvgraphHandle_t handle; + int* devices = (int*) malloc(sizeof(int) * 2); + devices[0] = 0; + devices[1] = 1; + nvgraphCreateMulti(&handle, 2, devices); + nvgraphGraphDescr_t graph; + nvgraphCreateGraphDescr(handle, &graph); + int rowIds[38] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, + 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8 }; + int colIds[38] = { 1, 2, 7, 8, 0, 2, 4, 7, 8, 0, 1, 3, 6, 8, 2, 4, 5, 6, 8, 1, 3, 5, 8, 3, 4, 6, + 7, 2, 3, 5, 0, 1, 5, 0, 1, 2, 3, 4 }; + nvgraph2dCOOTopology32I_st topo; + topo.nvertices = 9; + topo.nedges = 38; + topo.source_indices = rowIds; + topo.destination_indices = colIds; + topo.valueType = CUDA_R_32I; + topo.values = NULL; + topo.numDevices = 2; + topo.devices = devices; + topo.blockN = 2; + topo.tag = NVGRAPH_DEFAULT; + nvgraphSetGraphStructure(handle, graph, 
&topo, NVGRAPH_2D_32I_32I); + int* distances = (int*) malloc(sizeof(int) * 9); + int* predecessors = (int*) malloc(sizeof(int) * 9); + int sourceId = 0; + std::cout << "Source ID: " << sourceId << "\n"; + nvgraph2dBfs(handle, graph, sourceId, distances, predecessors); + std::cout << "Distances:\n"; + for (int i = 0; i < 9; i++) + std::cout << i << ":" << distances[i] << " "; + std::cout << "\nPredecessors:\n"; + for (int i = 0; i < 9; i++) + std::cout << i << ":" << predecessors[i] << " "; + std::cout << "\n"; + int exp_pred[9] = {-1,0,0,2,1,7,2,0,0}; + int exp_dist[9] = {0,1,1,2,2,2,2,1,1}; + for (int i = 0; i < 9; i++){ + ASSERT_EQ(exp_pred[i], predecessors[i]); + ASSERT_EQ(exp_dist[i], distances[i]); + } + std::cout << "Test run!\n"; +} + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/cpp/nvgraph/cpp/tests/CMakeLists.txt b/cpp/nvgraph/cpp/tests/CMakeLists.txt new file mode 100644 index 00000000000..eda3443f398 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/CMakeLists.txt @@ -0,0 +1,157 @@ +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) + +project(CUDF_TESTS LANGUAGES C CXX CUDA) + +################################################################################################### +# - compiler function ----------------------------------------------------------------------------- + +function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC) + add_executable(${CMAKE_TEST_NAME} ${CMAKE_TEST_SRC}) + set_target_properties(${CMAKE_TEST_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${CMAKE_TEST_NAME} gmock gtest gmock_main gtest_main pthread nvgraph_rapids cublas cusparse curand cusolver cudart) + set_target_properties(${CMAKE_TEST_NAME} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests") + add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) +endfunction(ConfigureTest) + +################################################################################################### +# - include paths --------------------------------------------------------------------------------- + +include_directories( + "${CMAKE_BINARY_DIR}/include" + "${CMAKE_SOURCE_DIR}/include" + "${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include" + "${CMAKE_SOURCE_DIR}/thirdparty/cub" + "${CMAKE_SOURCE_DIR}/../external" + "${CMAKE_SOURCE_DIR}/../external/cusp" + "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" + ) + +################################################################################################### +# - library paths --------------------------------------------------------------------------------- + +link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc + "${CMAKE_BINARY_DIR}/lib" + "${GTEST_LIBRARY_DIR}") + +################################################################################################### +### test sources ################################################################################## +################################################################################################### + +################################################################################################### +# - triangles tests ------------------------------------------------------------------------------------- + +set(TRIANGLES_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_triangles.cpp") + +ConfigureTest(TRIANGLES_TEST "${TRIANGLES_TEST_SRC}") + 
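# Note: each stanza in this file follows the same pattern: gather the test sources into a
# *_TEST_SRC variable and hand them to ConfigureTest(), which builds the executable, links it
# against gtest/gmock and nvgraph_rapids, places it under ${CMAKE_BINARY_DIR}/gtests, and
# registers it with CTest.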
+################################################################################################### +# - nvgraph tests ------------------------------------------------------------------------------------- + +set(NVGRAPH_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_test.cpp" + ) + +ConfigureTest(NVGRAPH_TEST "${NVGRAPH_TEST_SRC}") + +################################################################################################### +# - 2d_partitioning ------------------------------------------------------------------------------- + +set(2DPARTITIONING_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/2d_partitioning_test.cpp" + ) + +ConfigureTest(2DPARTITIONING_TEST "${2DPARTITIONING_TEST_SRC}") + +################################################################################################### +# - nvgraph_benchmark ----------------------------------------------------------------------------- + +set(NVGRAPH_BENCHMARK_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_benchmark.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/mmio.c" + ) + +ConfigureTest(NVGRAPH_BENCHMARK "${NVGRAPH_BENCHMARK_SRC}") + +################################################################################################### +# - nvgraph_capi_tests_2d_bfs --------------------------------------------------------------------- + +set(NVGRAPH_CAPI_TESTS_2D_BFS_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_2d_bfs.cpp" + ) + +ConfigureTest(NVGRAPH_CAPI_TESTS_2D_BFS "${NVGRAPH_CAPI_TESTS_2D_BFS_SRC}") + +################################################################################################### +# - nvgraph_capi_tests_2d_bfs_net ----------------------------------------------------------------- + +set(NVGRAPH_CAPI_TESTS_2D_BFS_NET_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_2d_bfs_net.cpp" + ) + +ConfigureTest(NVGRAPH_CAPI_TESTS_2D_BFS_NET "${NVGRAPH_CAPI_TESTS_2D_BFS_NET_SRC}") + +################################################################################################### +# - nvgraph_capi_tests_algorithms ----------------------------------------------------------------- + +set(NVGRAPH_CAPI_TESTS_ALGORITHMS_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_algorithms.cpp" + ) + +ConfigureTest(NVGRAPH_CAPI_TESTS_ALGORITHMS "${NVGRAPH_CAPI_TESTS_ALGORITHMS_SRC}") + +################################################################################################### +# - nvgraph_capi_tests_clustering ----------------------------------------------------------------- + +set(NVGRAPH_CAPI_TESTS_CLUSTERING_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_clustering.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/mmio.c" + ) + +ConfigureTest(NVGRAPH_CAPI_TESTS_CLUSTERING "${NVGRAPH_CAPI_TESTS_CLUSTERING_SRC}") + +################################################################################################### +# - nvgraph_capi_tests_contraction ---------------------------------------------------------------- +if(NOT NVGRAPH_LIGHT) +set(NVGRAPH_CAPI_TESTS_CONTRACTION_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_contraction.cpp" + ) + +ConfigureTest(NVGRAPH_CAPI_TESTS_CONTRACTION "${NVGRAPH_CAPI_TESTS_CONTRACTION_SRC}") +endif(NOT NVGRAPH_LIGHT) + +################################################################################################### +# - nvgraph_capi_test_conversion ------------------------------------------------------------------ + +set(NVGRAPH_CAPI_TEST_CONVERSION_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_conversion.cpp" + ) + +ConfigureTest(NVGRAPH_CAPI_TEST_CONVERSION 
"${NVGRAPH_CAPI_TEST_CONVERSION_SRC}") + +################################################################################################### +# - nvgraph_capi_tests_subgraph ------------------------------------------------------------------- + +set(NVGRAPH_CAPI_TESTS_SUBGRAPH_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_subgraph.cpp" + ) + +ConfigureTest(NVGRAPH_CAPI_TESTS_SUBGRAPH "${NVGRAPH_CAPI_TESTS_SUBGRAPH_SRC}") + +################################################################################################### +# - nvgraph_capi_tests_traversal ------------------------------------------------------------------ + +set(NVGRAPH_CAPI_TESTS_TRAVERSAL_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph_capi_tests_traversal.cpp" + ) + +ConfigureTest(NVGRAPH_CAPI_TESTS_TRAVERSAL "${NVGRAPH_CAPI_TESTS_TRAVERSAL_SRC}") + + +################################################################################################### +### enable testing ################################################################################ +################################################################################################### + +enable_testing() + diff --git a/cpp/nvgraph/cpp/tests/benchmarkScripts/modularity_paper.sh b/cpp/nvgraph/cpp/tests/benchmarkScripts/modularity_paper.sh new file mode 100644 index 00000000000..a02cef07fdf --- /dev/null +++ b/cpp/nvgraph/cpp/tests/benchmarkScripts/modularity_paper.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# ****************** Edit this ************************* + +#Path to nvgraph bin graphs +# From p4matrices:2024 sync //matrices/p4matrices/dimacs10/... +nvg_data_prefix="/home/mnaumov/cuda_matrices/p4matrices/dimacs10" +#nvg_data_prefix="/home/afender/modularity/mat" + +#Path to nvgraph +# nvg_bin_prefix should contain a release build of nvgraph's ToT (from p4sw //sw/gpgpu/nvgraph/...) 
+# and nvgraph_benchmark executable which is build along with nvgraph's tests +nvg_bin_prefix="/home/afender/modularity/sw/gpgpu/bin/x86_64_Linux_release" + +# ***************************************************** + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$nvg_bin_prefix +export PATH=$PATH:$nvg_bin_prefix + +declare -a dataset=( +"$nvg_data_prefix/preferentialAttachment.mtx" +"$nvg_data_prefix/caidaRouterLevel.mtx" +"$nvg_data_prefix/coAuthorsDBLP.mtx" +"$nvg_data_prefix/citationCiteseer.mtx" +"$nvg_data_prefix/coPapersDBLP.mtx" +"$nvg_data_prefix/coPapersCiteseer.mtx" +"/home/afender/modularity/as-Skitter.mtx" +"/home/afender/modularity/hollywood-2009.mtx" +#"$nvg_data_prefix/data.mtx" +#"/home/afender/modularity/karate.mtx" +#"$nvg_data_prefix/road_central.mtx" +#"$nvg_data_prefix/road_usa.mtx" +#"$nvg_data_prefix/rgg_n_2_23_s0.mtx" +) + +#One particular number of cluster +for i in "${dataset[@]}" +do + $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" 7 7 --double --repeats 4 +done +echo +for i in "${dataset[@]}" +do + $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" 7 7 --float --repeats 4 +done +echo + +#Spreadsheet 1 +#declare -ia clusters=(2 3 5 7 11 17 19 23 29 31 37 41 43 47 53) +#for i in "${dataset[@]}" +#do +# for j in "${clusters[@]}" +# do +# if [ $j -lt 10 ] +# then +# $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" $j $j --double --repeats 4 +# else +# $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" $j 7 --double --repeats 4 +# fi +# done +# echo +#done +#echo + +#Spreadsheet 3 (same as 1 in single precision) +#declare -ia clusters=(2 3 5 7 11 17 19 23 29 31 37 41 43 47 53) +#for i in "${dataset[@]}" +#do +# for j in "${clusters[@]}" +# do +# if [ $j -lt 10 ] +# then +# $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" $j $j --foat --repeats 4 +# else +# $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" $j 7 --foat --repeats 4 +# fi +# done +# echo +#done + +#run only best case according to Spreadsheet 1 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "$nvg_data_prefix/preferentialAttachment.mtx" 7 7 --double --repeats 4 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "$nvg_data_prefix/caidaRouterLevel.mtx" 11 7 --double --repeats 4 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "$nvg_data_prefix/coAuthorsDBLP.mtx" 7 7 --double --repeats 4 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "$nvg_data_prefix/citationCiteseer.mtx" 17 7 --double --repeats 4 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "$nvg_data_prefix/coPapersDBLP.mtx" 73 7 --double --repeats 4 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "$nvg_data_prefix/coPapersCiteseer.mtx" 53 7 --double --repeats 4 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "/home/afender/modularity/as-Skitter.mtx" 7 7 --double --repeats 4 +#$nvg_bin_prefix/nvgraph_benchmark --modularity "/home/afender/modularity/hollywood-2009.mtx" 11 7 --double --repeats 4 + +#Variation of the number of clusters and number of eigenpairs, independently on synthetic matrix +#for (( i = 2; i <= 8; i++ )) +#do +# for (( j = $i ; j <= 32; j++ )) +# do +# $nvg_bin_prefix/nvgraph_benchmark --modularity "/home/afender/modularity/karate_5_block_dia.mtx" $j $i --double --repeats 3 +# done +# echo +#done +#echo + +#profiles +#nvprof --profile-from-start off --export-profile coPapersDBLP.mtx_23clusters_3ev_32b.bin /home/afender/modularity/sw/gpgpu/bin/x86_64_Linux_release/nvgraph_benchmark --modularity "/home/mnaumov/cuda_matrices/p4matrices/dimacs10/coPapersDBLP.mtx" 23 3 --double --repeats 3 +# 
/home/mnaumov/cuda_toolkit/cuda-linux64-mixed-rel-nightly/bin/nvprof --profile-from-start off --export-profile eigensolver_coPapersDBLP.mtx_4clusters_4ev_32b.bin /home/afender/modularity/sw/gpgpu/bin/x86_64_Linux_release/nvgraph_benchmark --modularity "/home/mnaumov/cuda_matrices/p4matrices/dimacs10/coPapersDBLP.mtx" 4 4 --double --repeats 1 +# /home/mnaumov/cuda_toolkit/cuda-linux64-mixed-rel-nightly/bin/nvprof --profile-from-start off --export-profile total_coPapersDBLP.mtx_4clusters_4ev_32b.bin /home/afender/modularity/sw/gpgpu/bin/x86_64_Linux_release/nvgraph_benchmark --modularity "/home/mnaumov/cuda_matrices/p4matrices/dimacs10/coPapersDBLP.mtx" 4 4 --double --repeats 1 + +#small matrices +#declare -a dataset_small=( +#"$nvg_data_prefix/karate.mtx" +#"$nvg_data_prefix/dolphins.mtx" +##"$nvg_data_prefix/chesapeake.mtx" +#"$nvg_data_prefix/lesmis.mtx" +#"$nvg_data_prefix/adjnoun.mtx" +#"$nvg_data_prefix/polbooks.mtx" +#"$nvg_data_prefix/football.mtx" +#"$nvg_data_prefix/celegansneural.mtx" +##"$nvg_data_prefix/jazz.mtx" +#"$nvg_data_prefix/netscience.mtx" +##"$nvg_data_prefix/email.mtx" +#"$nvg_data_prefix/power.mtx" +#"$nvg_data_prefix/hep-th.mtx" +#"$nvg_data_prefix/polblogs.mtx" +##"$nvg_data_prefix/PGPgiantcompo.mtx" +#"$nvg_data_prefix/cond-mat.mtx" +#"$nvg_data_prefix/as-22july06.mtx" +#"$nvg_data_prefix/cond-mat-2003.mtx" +#"$nvg_data_prefix/astro-ph.mtx" +#) +#declare -ia clusters=(2 3 5 7 11 17 19 23 29 31) +#for i in "${dataset_small[@]}" +#do +# for j in "${clusters[@]}" +# do +# if [ $j -lt 10 ] +# then +# $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" $j $j --double --repeats 4 +# else +# $nvg_bin_prefix/nvgraph_benchmark --modularity "$i" $j 7 --double --repeats 4 +# fi +# done +# echo +#done +#echo diff --git a/cpp/nvgraph/cpp/tests/benchmarkScripts/run_galois.sh b/cpp/nvgraph/cpp/tests/benchmarkScripts/run_galois.sh new file mode 100644 index 00000000000..f38ef75673a --- /dev/null +++ b/cpp/nvgraph/cpp/tests/benchmarkScripts/run_galois.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# ****************** Edit this ************************* +# Path to local workspace containing p4matrices:2024 sync //matrices/p4matrices/graphs/... 
+nvg_data_prefix="/home/afender/src/matrices/p4matrices/graphs" + +#Path to galois +galois_root="/home/afender/soft/galois-2.3.0/build/default" +# ***************************************************** +export OMP_NUM_THREADS=24 + +declare -a arr=( + #Small mtx just for debug + #"$nvg_data_prefix/small/small.mtx" + "$nvg_data_prefix/soc-liveJournal/soc-LiveJournal1.mtx" + "$nvg_data_prefix/Twitter/twitter.mtx" +) + +## now loop through the above array +for i in "${arr[@]}" +do + echo "Pagerank" + echo "$i" + time $galois_root/tools/graph-convert/graph-convert -mtx2gr -edgeType=float32 -print-all-options $i $i.galois + time $galois_root/tools/graph-convert/graph-convert -gr2tgr -edgeType=float32 -print-all-options $i.galois $i_T.galois + time $galois_root/apps/pagerank/app-pagerank $i.galois -graphTranspose="$i_T.galois" -t=$OMP_NUM_THREADS + echo +done +echo +for i in "${arr[@]}" +do + echo "SSSP" + echo "$i" + time $galois_root/apps/sssp/app-sssp $i.galois -startNode=0 -t=$OMP_NUM_THREADS + echo +done +echo diff --git a/cpp/nvgraph/cpp/tests/benchmarkScripts/run_graphMat.sh b/cpp/nvgraph/cpp/tests/benchmarkScripts/run_graphMat.sh new file mode 100644 index 00000000000..90157999d85 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/benchmarkScripts/run_graphMat.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# ****************** Edit this ************************* +#******************************************************* +#Path to graphMat binary data +gm_data_prefix="/home-2/afender/GraphMat-master/data" +#Path to graphMat binary +gm_bin_prefix="/home-2/afender/GraphMat-master/bin" +#Number of core to use in graphMat +export OMP_NUM_THREADS=24 +# ****************************************************** +#******************************************************* +# NOTE +#twitter_graphMat.bin and live_journal_graphMat.bin are assumed to be in "gm_data_prefix" directory +#******************************************************* + +# Requiered export according to the doc +export KMP_AFFINITY=scatter + +#Pagerank runs +numactl -i all $gm_bin_prefix/PageRank $gm_data_prefix/twitter.graphmat.bin +numactl -i all $gm_bin_prefix/PageRank $gm_data_prefix/soc-LiveJournal1.graphmat.bin + +# SSSP runs +# Warning: vertices seems to have 1-based indices (nvGraph use 0-base) +numactl -i all $gm_bin_prefix/SSSP $gm_data_prefix/twitter.graphmat.bin 1 +numactl -i all $gm_bin_prefix/SSSP $gm_data_prefix/soc-LiveJournal1.graphmat.bin 1 \ No newline at end of file diff --git a/cpp/nvgraph/cpp/tests/benchmarkScripts/run_nvgraph.sh b/cpp/nvgraph/cpp/tests/benchmarkScripts/run_nvgraph.sh new file mode 100644 index 00000000000..887def03fb2 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/benchmarkScripts/run_nvgraph.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# ****************** Edit this ************************* +#Path to nvgraph bin graphs +# From p4matrices:2024 sync //matrices/p4matrices/graphs/... +nvg_data_prefix="/home/afender/src/matrices/p4matrices/graphs" + +#Path to nvgraph +# nvg_bin_prefix should contain a release build of nvgraph's ToT (from p4sw //sw/gpgpu/nvgraph/...) 
+# and nvgraph_benchmark executable which is build along with nvgraph's tests +nvg_bin_prefix="/home/afender/src/sw/sw/gpgpu/bin/x86_64_Linux_release" +# ***************************************************** + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$nvg_bin_prefix +export PATH=$PATH:$nvg_bin_prefix + +declare -a arr=( + "$nvg_data_prefix/webbase1M/webbase-1M_T.mtx.bin" + "$nvg_data_prefix/liveJournal/ljournal-2008_T.mtx.bin" + "$nvg_data_prefix/webGoogle/web-Google_T.mtx.bin" + "$nvg_data_prefix/citPatents/cit-Patents_T.mtx.bin" + "$nvg_data_prefix/webBerkStan/web-BerkStan_T.mtx.bin" + "$nvg_data_prefix/WikiTalk/wiki-Talk_T.mtx.bin" + "$nvg_data_prefix/soc-liveJournal/soc-LiveJournal1_T.mtx.bin" + # Warning : Twitter case works only on GPU with more than 12 GB of memory + "$nvg_data_prefix/Twitter/twitter.bin" + #Just for debug + #"$nvg_data_prefix/small/small.bin" +) + + +## now loop through the above array +for i in "${arr[@]}" +do + echo "Pagerank" + echo "$i" + echo "single precision" + $nvg_bin_prefix/nvgraph_benchmark --pagerank "$i" 0.85 500 1E-6 --float --repeats 10 + echo + #echo "Pagerank" + #echo "$i" + #echo "double precision" + #$nvg_bin_prefix/nvgraph_benchmark --pagerank "$i" 0.85 500 1E-6 --double --repeats 10 + #echo +done +echo +for i in "${arr[@]}" +do + echo "SSSP" + echo "$i" + echo "single precision" + $nvg_bin_prefix/nvgraph_benchmark --sssp "$i" 0 --float --repeats 10 + echo + #echo "SSSP" + #echo "$i" + #echo "double precision" + #$nvg_bin_prefix/nvgraph_benchmark --sssp "$i" 0 --double --repeats 10 + #echo +done +echo +for i in "${arr[@]}" +do + echo "Widest Path" + echo "$i" + echo "single precision" + $nvg_bin_prefix/nvgraph_benchmark --widest "$i" 0 --float --repeats 10 + echo + #echo "Widest Path" + #echo "$i" + #echo "double precision" + #$nvg_bin_prefix/nvgraph_benchmark --widest "$i" 0 --double --repeats 10 + #echo +done +echo diff --git a/cpp/nvgraph/cpp/tests/convert_preset_testcases.h b/cpp/nvgraph/cpp/tests/convert_preset_testcases.h new file mode 100644 index 00000000000..2f7ac1b6760 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/convert_preset_testcases.h @@ -0,0 +1,125 @@ +#include + + +// // Simple Conversion Matrices (1) +// //------------------------------------------------------------------------------------- +// // Matrix A +// // 0.0 0.0 0.2 0.0 1.0 +// // 0.3 0.7 0.0 1.2 0.0 +// // 0.0 0.0 0.0 0.0 0.0 +// // 0.0 0.0 8.6 0.0 0.0 +// // 0.0 0.0 0.0 0.0 0.986410984960948401569841 +// // +// // n = 5; +// // m = 5; +// // nnz = 7; +// // csrVal = {0.2, 1.0, 0.3, 0.7, 1.2, 8.6, 0.986410984960948401569841}; +// // csrColInd = {2, 4, 0, 1, 3, 2, 4}; +// // csrRowPtr = {0, 2, 5, 5, 6, 7}; +// // +// // cscVal = {0.3, 0.7, 0.2, 8.6, 1.2, 1.0, 0.986410984960948401569841}; +// // cscRowInc = {1, 1, 0, 3, 1, 0, 4}; +// // cscColPtr = {0, 1, 2, 4, 5, 7}; +// // +// // COOSourceVal = {0.2, 1.0, 0.3, 0.7, 1.2, 8.6, 0.986410984960948401569841}; +// // COOSourceRowInc = {0, 0, 1, 1, 1, 3, 4}; +// // COOSourceColInc = {2, 4, 0, 1, 3, 2, 4}; +// // +// // COODestVal = {0.3, 0.7, 0.2, 8.6, 1.2, 1.0, 0.986410984960948401569841}; +// // COODestRowInc = {1, 1, 0, 3, 1, 0, 4}; +// // COODestColInc = {0, 1, 2, 2, 3, 4, 4}; +// //------------------------------------------------------------------------------------- +#define SIMPLE_TEST_1_N 5 +#define SIMPLE_TEST_1_NNZ 7 + +int SIMPLE_CSR_SOURCE_OFFSETS[SIMPLE_TEST_1_N+1] = {0, 2, 5, 5, 6, 7}; // rowPtr +int SIMPLE_CSR_DESTINATION_INDICES[SIMPLE_TEST_1_NNZ] = {2, 4, 0 ,1 ,3 ,2 ,4}; // colInd + +int 
SIMPLE_CSC_SOURCE_INDICES[SIMPLE_TEST_1_NNZ] = {1, 1, 0, 3, 1, 0, 4}; // rowInc +int SIMPLE_CSC_DESTINATION_OFFSETS[SIMPLE_TEST_1_N+1] = {0, 1, 2, 4, 5, 7}; // colPtr + +int SIMPLE_COOS_SOURCE_INDICES[SIMPLE_TEST_1_NNZ] = {0, 0, 1, 1, 1, 3, 4}; // row +int SIMPLE_COOS_DESTINATION_INDICES[SIMPLE_TEST_1_NNZ] = {2, 4, 0, 1, 3, 2, 4}; // col + +int SIMPLE_COOD_SOURCE_INDICES[SIMPLE_TEST_1_NNZ] = {1, 1, 0, 3, 1, 0, 4}; // row +int SIMPLE_COOD_DESTINATION_INDICES[SIMPLE_TEST_1_NNZ] = {0, 1, 2, 2, 3, 4, 4}; //col + +int SIMPLE_COOU_SOURCE_INDICES[SIMPLE_TEST_1_NNZ] = {4, 1, 0, 3, 0, 1, 1}; // row +int SIMPLE_COOU_DESTINATION_INDICES[SIMPLE_TEST_1_NNZ] = {4, 1, 2, 2, 4, 3, 0}; //col + +const double SIMPLE_CSR_EDGE_DATA[SIMPLE_TEST_1_NNZ] = {0.2, 1.0, 0.3, 0.7, 1.2, 8.6, 0.986410984960948401569841}; +const double SIMPLE_CSC_EDGE_DATA[SIMPLE_TEST_1_NNZ] = {0.3, 0.7, 0.2, 8.6, 1.2, 1.0, 0.986410984960948401569841}; + +const double SIMPLE_COOS_EDGE_DATA[SIMPLE_TEST_1_NNZ] = {0.2, 1.0, 0.3, 0.7, 1.2, 8.6, 0.986410984960948401569841}; +const double SIMPLE_COOD_EDGE_DATA[SIMPLE_TEST_1_NNZ] = {0.3, 0.7, 0.2, 8.6, 1.2, 1.0, 0.986410984960948401569841}; +const double SIMPLE_COOU_EDGE_DATA[SIMPLE_TEST_1_NNZ] = {0.986410984960948401569841, 0.7, 0.2, 8.6, 1.0, 1.2, 0.3}; + + +nvgraphCSRTopology32I_st simpleCsrTopo = { + SIMPLE_TEST_1_N, + SIMPLE_TEST_1_NNZ, + SIMPLE_CSR_SOURCE_OFFSETS, + SIMPLE_CSR_DESTINATION_INDICES +}; +nvgraphCSCTopology32I_st simpleCscTopo = { + SIMPLE_TEST_1_N, + SIMPLE_TEST_1_NNZ, + SIMPLE_CSC_DESTINATION_OFFSETS, + SIMPLE_CSC_SOURCE_INDICES +}; +nvgraphCOOTopology32I_st simpleCooSourceTopo = { + SIMPLE_TEST_1_N, + SIMPLE_TEST_1_NNZ, + SIMPLE_COOS_SOURCE_INDICES, + SIMPLE_COOS_DESTINATION_INDICES, + NVGRAPH_SORTED_BY_SOURCE +}; +nvgraphCOOTopology32I_st simpleCooDestTopo = { + SIMPLE_TEST_1_N, + SIMPLE_TEST_1_NNZ, + SIMPLE_COOD_SOURCE_INDICES, + SIMPLE_COOD_DESTINATION_INDICES, + NVGRAPH_SORTED_BY_DESTINATION +}; +nvgraphCOOTopology32I_st simpleCooUnsortedTopo = { + SIMPLE_TEST_1_N, + SIMPLE_TEST_1_NNZ, + SIMPLE_COOU_SOURCE_INDICES, + SIMPLE_COOU_DESTINATION_INDICES, + NVGRAPH_UNSORTED +}; + +// //------------------------------------------------------------------------------------- + +struct presetTestContainer_st{ + nvgraphCSRTopology32I_st* csrTopo; + nvgraphCSCTopology32I_st* cscTopo; + nvgraphCOOTopology32I_st* coosTopo; // source + nvgraphCOOTopology32I_st* coodTopo; // dest + nvgraphCOOTopology32I_st* coouTopo; // unsorted + const void* csrEdgeData; + const void* cscEdgeData; + const void* coosEdgeData; + const void* coodEdgeData; + const void* coouEdgeData; +}; +typedef struct presetTestContainer_st *presetTestContainer_t; + + +// Hold all test data in one container +presetTestContainer_st simpleTest1 = { + &simpleCsrTopo, + &simpleCscTopo, + &simpleCooSourceTopo, + &simpleCooDestTopo, + &simpleCooUnsortedTopo, + SIMPLE_CSR_EDGE_DATA, + SIMPLE_CSC_EDGE_DATA, + SIMPLE_COOS_EDGE_DATA, + SIMPLE_COOD_EDGE_DATA, + SIMPLE_COOU_EDGE_DATA +}; + +//------------------------------------------------------------------------------------- +// Add your preset tests here +presetTestContainer_st presetTests[] = {simpleTest1}; diff --git a/cpp/nvgraph/cpp/tests/mm.hxx b/cpp/nvgraph/cpp/tests/mm.hxx new file mode 100644 index 00000000000..1d6b543ef3f --- /dev/null +++ b/cpp/nvgraph/cpp/tests/mm.hxx @@ -0,0 +1,352 @@ +#pragma once + +#include +extern "C" { +#include "mmio.h" +} +#include + +/// Read matrix properties from Matrix Market file +/** Matrix Market file is assumed to be a sparse matrix in 
coordinate + * format. + * + * @param f File stream for Matrix Market file. + * @param tg Boolean indicating whether to convert matrix to general + * format (from symmetric, Hermitian, or skew symmetric format). + * @param t (Output) MM_typecode with matrix properties. + * @param m (Output) Number of matrix rows. + * @param n (Output) Number of matrix columns. + * @param nnz (Output) Number of non-zero matrix entries. + * @return Zero if properties were read successfully. Otherwise + * non-zero. + */ +template +int mm_properties(FILE * f, int tg, MM_typecode * t, + IndexType_ * m, IndexType_ * n, + IndexType_ * nnz) { + + // Read matrix properties from file + int mint, nint, nnzint; + if(fseek(f,0,SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if(mm_read_banner(f,t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if(!mm_is_matrix(*t) || !mm_is_coordinate(*t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if(mm_read_mtx_crd_size(f,&mint,&nint,&nnzint)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if(!mm_is_pattern(*t) && !mm_is_real(*t) && + !mm_is_integer(*t) && !mm_is_complex(*t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + *m = mint; + *n = nint; + *nnz = nnzint; + + // Find total number of non-zero entries + if(tg && !mm_is_general(*t)) { + + // Non-diagonal entries should be counted twice + IndexType_ nnzOld = *nnz; + *nnz *= 2; + + // Diagonal entries should not be double-counted + int i; int st; + for(i=0; i +int mm_to_coo(FILE *f, int tg, IndexType_ nnz, + IndexType_ * cooRowInd, IndexType_ * cooColInd, + ValueType_ * cooRVal , ValueType_ * cooIVal) { + + // Read matrix properties from file + MM_typecode t; + int m, n, nnzOld; + if(fseek(f,0,SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if(mm_read_banner(f,&t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if(!mm_is_matrix(t) || !mm_is_coordinate(t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if(mm_read_mtx_crd_size(f,&m,&n,&nnzOld)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if(!mm_is_pattern(t) && !mm_is_real(t) && + !mm_is_integer(t) && !mm_is_complex(t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + + // Add each matrix entry in file to COO format matrix + IndexType_ i; // Entry index in Matrix Market file + IndexType_ j = 0; // Entry index in COO format matrix + for(i=0;i + __host__ __device__ + bool operator()(const Tuple1 t1, const Tuple2 t2) { + switch(i) { + case 0: return (thrust::get<0>(t1) < thrust::get<0>(t2)); + case 1: return (thrust::get<1>(t1) < thrust::get<1>(t2)); + default: return (thrust::get<0>(t1) < thrust::get<0>(t2)); + } + + } +}; + +/// Sort entries in COO format matrix +/** Sort is stable. + * + * @param nnz Number of non-zero matrix entries. + * @param sort_by_row Boolean indicating whether matrix entries + * will be sorted by row index or by column index. + * @param cooRowInd Row indices for COO matrix. + * @param cooColInd Column indices for COO matrix. + * @param cooRVal Real component for COO matrix entries. Ignored if + * null pointer. + * @param cooIVal Imaginary component COO matrix entries. Ignored if + * null pointer. 
+ */ +template +void coo_sort(IndexType_ nnz, int sort_by_row, + IndexType_ * cooRowInd, + IndexType_ * cooColInd, + ValueType_ * cooRVal, + ValueType_ * cooIVal) { + + // Determine whether to sort by row or by column + int i; + if(sort_by_row == 0) + i = 1; + else + i = 0; + + // Apply stable sort + using namespace thrust; + if((cooRVal==NULL) && (cooIVal==NULL)) + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz)), + lesser_tuple(i)); + else if((cooRVal==NULL) && (cooIVal!=NULL)) + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooIVal)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooIVal+nnz)), + lesser_tuple(i)); + else if((cooRVal!=NULL) && (cooIVal==NULL)) + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooRVal+nnz)), + lesser_tuple(i)); + else + stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal,cooIVal)), + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz, + cooRVal+nnz,cooIVal+nnz)), + lesser_tuple(i)); +} + +/// Compress sorted list of indices +/** For use in converting COO format matrix to CSR or CSC format. + * + * @param n Maximum index. + * @param nnz Number of non-zero matrix entries. + * @param sortedIndices Sorted list of indices (COO format). + * @param compressedIndices (Output) Compressed list of indices (CSR + * or CSC format). Should have at least n+1 entries. + */ +template +void coo_compress(IndexType_ n, IndexType_ nnz, + const IndexType_ * __restrict__ sortedIndices, + IndexType_ * __restrict__ compressedIndices) { + IndexType_ i; + + // Initialize everything to zero + memset(compressedIndices, 0, (n+1)*sizeof(IndexType_)); + + // Count number of elements per row + for(i=0; i +int coo_to_csr(IndexType_ m, IndexType_ n, IndexType_ nnz, + IndexType_ * __restrict__ cooRowInd, + IndexType_ * __restrict__ cooColInd, + ValueType_ * __restrict__ cooRVal, + ValueType_ * __restrict__ cooIVal, + IndexType_ * __restrict__ csrRowPtr, + IndexType_ * __restrict__ csrColInd, + ValueType_ * __restrict__ csrRVal, + ValueType_ * __restrict__ csrIVal) { + + // Convert COO to CSR matrix + coo_sort(nnz, 0, cooRowInd, cooColInd, cooRVal, cooIVal); + coo_sort(nnz, 1, cooRowInd, cooColInd, cooRVal, cooIVal); + coo_compress(n, nnz, cooRowInd, csrRowPtr); + + // Copy arrays + if(csrColInd!=NULL) + memcpy(csrColInd, cooColInd, nnz*sizeof(IndexType_)); + if((cooRVal!=NULL) && (csrRVal!=NULL)) + memcpy(csrRVal, cooRVal, nnz*sizeof(ValueType_)); + if((cooIVal!=NULL) && (csrIVal!=NULL)) + memcpy(csrIVal, cooIVal, nnz*sizeof(ValueType_)); + + return 0; + +} diff --git a/cpp/nvgraph/cpp/tests/mmio.c b/cpp/nvgraph/cpp/tests/mmio.c new file mode 100644 index 00000000000..32f76c96b07 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/mmio.c @@ -0,0 +1,511 @@ +/* +* Matrix Market I/O library for ANSI C +* +* See http://math.nist.gov/MatrixMarket for details. 
+* +* +*/ + + +#include +#include +#include +#include + +#include "mmio.h" + +int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, + double **val_, int **I_, int **J_) +{ + FILE *f; + MM_typecode matcode; + int M, N, nz; + int i; + double *val; + int *I, *J; + + if ((f = fopen(fname, "r")) == NULL) + return -1; + + + if (mm_read_banner(f, &matcode) != 0) + { + printf("mm_read_unsymetric: Could not process Matrix Market banner "); + printf(" in file [%s]\n", fname); + return -1; + } + + + + if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && + mm_is_sparse(matcode))) + { + fprintf(stderr, "Sorry, this application does not support "); + fprintf(stderr, "Market Market type: [%s]\n", + mm_typecode_to_str(matcode)); + return -1; + } + + /* find out size of sparse matrix: M, N, nz .... */ + + if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) + { + fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); + return -1; + } + + *M_ = M; + *N_ = N; + *nz_ = nz; + + /* reseve memory for matrices */ + + I = (int *) malloc(nz * sizeof(int)); + J = (int *) malloc(nz * sizeof(int)); + val = (double *) malloc(nz * sizeof(double)); + + *val_ = val; + *I_ = I; + *J_ = J; + + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ + /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ + /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */ + + for (i=0; i +#include "nvgraph_test_common.h" +#include "valued_csr_graph.hxx" +#include "readMatrix.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include "nvgraph_experimental.h" +#include "stdlib.h" +#include "stdint.h" +#include +extern "C" { +#include "mmio.h" +} +#include "mm.hxx" +// minimum vertices in the graph to perform perf measurements +#define PERF_ROWS_LIMIT 10000 + +// number of repeats = multiplier/num_vertices +#define SRSPMV_ITER_MULTIPLIER 1000000000 +#define SSSP_ITER_MULTIPLIER 30000000 +#define WIDEST_ITER_MULTIPLIER 30000000 +#define PAGERANK_ITER_MULTIPLIER 300000000 + +// utility + +#define NVGRAPH_SAFE_CALL(call) \ +{\ + nvgraphStatus_t status = (call) ;\ + if ( NVGRAPH_STATUS_SUCCESS != status )\ + {\ + std::cout << "Error #" << status << " in " << __FILE__ << ":" << __LINE__ << std::endl;\ + exit(1);\ + }\ +} + +#define CUDA_SAFE_CALL(call) \ +{\ + cudaError_t status = (call) ;\ + if ( cudaSuccess != status )\ + {\ + std::cout << "Error #" << status << " in " << __FILE__ << ":" << __LINE__ << std::endl;\ + exit(1);\ + }\ +} + +template +struct nvgraph_Const; + +template <> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_64F; + static const double inf; + static const double tol; + typedef union fpint + { + double f; + unsigned long u; + } fpint_st; +}; + +const double nvgraph_Const::inf = DBL_MAX; +const double nvgraph_Const::tol = 1e-6; // this is what we use as a tolerance in the algorithms, more precision than this is useless for CPU reference comparison + +template <> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_32F; + static const float inf; + static const float tol; + + typedef union fpint + { + float f; + unsigned u; + } fpint_st; + +}; + +const float nvgraph_Const::inf = FLT_MAX; +const float nvgraph_Const::tol = 1e-4; + +template <> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_32I; + static const int inf; + static const int tol; + +}; + +const int nvgraph_Const::inf = INT_MAX; +const int nvgraph_Const::tol = 0; + +typedef struct SrSPMV_Usecase_t +{ + std::string graph_file; + 
int repeats; + SrSPMV_Usecase_t(const std::string& a, const int b) : graph_file(a), repeats(b){}; + SrSPMV_Usecase_t& operator=(const SrSPMV_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + repeats = rhs.repeats; + return *this; + } +} SrSPMV_Usecase; + +template +void run_srspmv_bench(const SrSPMV_Usecase& param) +{ + std::cout << "Initializing nvGRAPH library..." << std::endl; + + nvgraphHandle_t handle = NULL; + + if (handle == NULL) + { + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + } + + std::cout << "Reading input data..." << std::endl; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + if (fpin == NULL) + { + std::cout << "Cannot open input graph file: " << param.graph_file << std::endl; + exit(1); + } + + int m, n, nnz; + MM_typecode mc; + + if(mm_properties(fpin, 1, &mc, &m, &n, &nnz) != 0) + { + std::cout << "could not read Matrix Market file properties"<< "\n"; + exit(1); + } + + std::vector read_row_ptr(n+1), read_col_ind(nnz), coo_row_ind(nnz); + std::vector csr_read_val(nnz); + + if(mm_to_coo(fpin, 1, nnz, &coo_row_ind[0], &read_col_ind[0], &csr_read_val[0], NULL)) + { + std::cout << "could not read matrix data"<< "\n"; + exit(1); + } + + if(coo_to_csr (n, n, nnz, &coo_row_ind[0], &read_col_ind[0], &csr_read_val[0], NULL, &read_row_ptr[0], NULL, NULL, NULL)) + { + std::cout << "could not covert COO to CSR "<< "\n"; + exit(1); + } + + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + /*if (read_header_amgx_csr_bin (fpin, n, nnz) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + if (read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + }*/ + fclose(fpin); + + std::cout << "Initializing data structures ..." 
<< std::endl; + + nvgraphGraphDescr_t g1 = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + nvgraphCSRTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + NVGRAPH_SAFE_CALL(nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo)); + + // set up graph data + std::vector calculated_res(n); + std::vector data1(n), data2(n); + for (int i = 0; i < n; i++) + { + data1[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + data2[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + //printf ("data1[%d]==%f, data2[%d]==%f\n", i, data1[i], i, data2[i]); + } + void* vertexptr[2] = {(void*)&data1[0], (void*)&data2[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&csr_read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + int weight_index = 0; + int x_index = 0; + int y_index = 1; + NVGRAPH_SAFE_CALL(nvgraphAllocateVertexData(handle, g1, 2, type_v )); + NVGRAPH_SAFE_CALL(nvgraphSetVertexData(handle, g1, vertexptr[0], x_index )); + NVGRAPH_SAFE_CALL(nvgraphSetVertexData(handle, g1, vertexptr[1], y_index )); + NVGRAPH_SAFE_CALL(nvgraphAllocateEdgeData(handle, g1, 1, type_e)); + NVGRAPH_SAFE_CALL(nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], weight_index )); + + // run + double start, stop, total = 0.; + T alphaT = 1., betaT = 0.; + nvgraphSemiring_t sr = NVGRAPH_PLUS_TIMES_SR; + int repeat = std::max(param.repeats, 1); + NVGRAPH_SAFE_CALL(nvgraphSrSpmv(handle, g1, weight_index, (void*)&alphaT, x_index, (void*)&betaT, y_index, sr)); + NVGRAPH_SAFE_CALL(nvgraphSrSpmv(handle, g1, weight_index, (void*)&alphaT, x_index, (void*)&betaT, y_index, sr)); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + std::cout << "Running spmv for " << repeat << " times..." << std::endl; + std::cout << "n = " << n << ", nnz = " << nnz << std::endl; + for (int i = 0; i < repeat; i++) + { + start = second(); + start = second(); + NVGRAPH_SAFE_CALL(nvgraphSrSpmv(handle, g1, weight_index, (void*)&alphaT, x_index, (void*)&betaT, y_index, sr)); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + stop = second(); + total += stop - start; + } + std::cout << "nvgraph time = " << 1000.*total/((double)repeat) << std::endl; + + NVGRAPH_SAFE_CALL(nvgraphDestroyGraphDescr(handle, g1)); + + if (handle != NULL) + { + NVGRAPH_SAFE_CALL(nvgraphDestroy(handle)); + handle = NULL; + } +} + +typedef struct WidestPath_Usecase_t +{ + std::string graph_file; + int source_vert; + int repeats; + WidestPath_Usecase_t(const std::string& a, int b, const int c) : graph_file(a), source_vert(b), repeats(c){}; + WidestPath_Usecase_t& operator=(const WidestPath_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + repeats = rhs.repeats; + return *this; + } +} WidestPath_Usecase; + +// ref functions taken from cuSparse +template +void ref_csr2csc (int m, int n, int nnz, const T_ELEM *csrVals, const int *csrRowptr, const int *csrColInd, T_ELEM *cscVals, int *cscRowind, int *cscColptr, int base=0){ + int i,j, row, col, index; + int * counters; + T_ELEM val; + + /* early return */ + if ((m <= 0) || (n <= 0) || (nnz <= 0)){ + return; + } + + /* build compressed column pointers */ + memset(cscColptr, 0, (n+1)*sizeof(cscColptr[0])); + cscColptr[0]=base; + for (i=0; i +void run_widest_bench(const WidestPath_Usecase& param) +{ + std::cout << "Initializing nvGRAPH library..." 
<< std::endl; + + nvgraphHandle_t handle = NULL; + + if (handle == NULL) + { + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + } + + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + std::cout << "Reading input data..." << std::endl; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + if (fpin == NULL) + { + std::cout << "Cannot open input graph file: " << param.graph_file << std::endl; + exit(1); + } + + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + if (read_header_amgx_csr_bin (fpin, n, nnz) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + if (read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + fclose(fpin); + + std::cout << "Initializing data structures ..." << std::endl; + + nvgraphGraphDescr_t g1 = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + NVGRAPH_SAFE_CALL(nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo)); + + // set up graph data + size_t numsets = 1; + std::vector calculated_res(n); + //void* vertexptr[1] = {(void*)&calculated_res[0]}; + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + NVGRAPH_SAFE_CALL(nvgraphAllocateVertexData(handle, g1, numsets, type_v)); + NVGRAPH_SAFE_CALL(nvgraphAllocateEdgeData(handle, g1, numsets, type_e )); + NVGRAPH_SAFE_CALL(nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 )); + + int weight_index = 0; + int source_vert = param.source_vert; + int widest_path_index = 0; + + + // run + std::cout << "Running algorithm..." << std::endl; + double start, stop; + start = second(); + start = second(); + int repeat = std::max(param.repeats, 1); + for (int i = 0; i < repeat; i++) + NVGRAPH_SAFE_CALL(nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index)); + stop = second(); + printf("Time of single WidestPath call is %10.8fsecs\n", (stop-start)/repeat); + + NVGRAPH_SAFE_CALL(nvgraphDestroyGraphDescr(handle, g1)); + + if (handle != NULL) + { + NVGRAPH_SAFE_CALL(nvgraphDestroy(handle)); + handle = NULL; + } +} + +typedef struct SSSP_Usecase_t +{ + std::string graph_file; + int source_vert; + int repeats; + SSSP_Usecase_t(const std::string& a, int b, int c) : graph_file(a), source_vert(b), repeats(c){}; + SSSP_Usecase_t& operator=(const SSSP_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + repeats = rhs.repeats; + return *this; + } +} SSSP_Usecase; + +template +void run_sssp_bench(const SSSP_Usecase& param) +{ + std::cout << "Initializing nvGRAPH library..." << std::endl; + + nvgraphHandle_t handle = NULL; + + if (handle == NULL) + { + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + } + + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + std::cout << "Reading input data..." 
<< std::endl; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + if (fpin == NULL) + { + std::cout << "Cannot read input graph file: " << param.graph_file << std::endl; + exit(1); + } + + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + if (read_header_amgx_csr_bin (fpin, n, nnz) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + if (read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + fclose(fpin); + + std::cout << "Initializing data structures ..." << std::endl; + + nvgraphGraphDescr_t g1 = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + NVGRAPH_SAFE_CALL(nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo)); + + // set up graph data + size_t numsets = 1; + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + NVGRAPH_SAFE_CALL(nvgraphAllocateVertexData(handle, g1, numsets, type_v)); + NVGRAPH_SAFE_CALL(nvgraphAllocateEdgeData(handle, g1, numsets, type_e )); + NVGRAPH_SAFE_CALL(nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0)); + + int weight_index = 0; + int source_vert = param.source_vert; + int sssp_index = 0; + + // run + std::cout << "Running algorithm ..." << std::endl; + double start, stop; + start = second(); + start = second(); + int repeat = std::max(param.repeats, 1); + for (int i = 0; i < repeat; i++) + NVGRAPH_SAFE_CALL(nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index)); + stop = second(); + printf("Time of single SSSP call is %10.8fsecs\n", (stop-start)/repeat); + + NVGRAPH_SAFE_CALL(nvgraphDestroyGraphDescr(handle, g1)); + + if (handle != NULL) + { + NVGRAPH_SAFE_CALL(nvgraphDestroy(handle)); + handle = NULL; + } +} + +typedef struct Traversal_Usecase_t +{ + std::string graph_file; + int source_vert; + int repeats; + Traversal_Usecase_t(const std::string& a, int b, int c) : graph_file(a), source_vert(b), repeats(c){}; + Traversal_Usecase_t& operator=(const Traversal_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + repeats = rhs.repeats; + return *this; + } +} Traversal_Usecase; + + +template +void run_traversal_bench(const Traversal_Usecase& param) +{ + std::cout << "Initializing nvGRAPH library..." << std::endl; + + nvgraphHandle_t handle = NULL; + + if (handle == NULL) + { + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + } + + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + + std::cout << "Reading input data..." 
<< std::endl; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + if (fpin == NULL) + { + std::cout << "Cannot read input graph file: " << param.graph_file << std::endl; + exit(1); + } + + + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + /* + if (read_header_amgx_csr_bin (fpin, n, nnz) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + if (read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, csr_read_val) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + fclose(fpin); + */ + int m, n, nnz; + MM_typecode mc; + + if(mm_properties(fpin, 1, &mc, &m, &n, &nnz) != 0) { + std::cout << "could not read Matrix Market file properties"<< "\n"; + exit(1); + } + + std::vector read_row_ptr(n+1), read_col_ind(nnz), coo_row_ind(nnz); + std::vector csr_read_val(nnz); + + if(mm_to_coo(fpin, 1, nnz, &coo_row_ind[0], &read_col_ind[0], &csr_read_val[0], NULL)) { + std::cout << "could not read matrix data"<< "\n"; + exit(1); + } + + if(coo_to_csr (n, n, nnz, &coo_row_ind[0], &read_col_ind[0], &csr_read_val[0], NULL, &read_row_ptr[0], NULL, NULL, NULL)) { + std::cout << "could not covert COO to CSR "<< "\n"; + exit(1); + } + + + std::cout << "Initializing data structures ..." << std::endl; + + nvgraphGraphDescr_t g1 = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + NVGRAPH_SAFE_CALL(nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo)); + + // set up graph data + size_t numsets = 1; + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + + NVGRAPH_SAFE_CALL(nvgraphAllocateVertexData(handle, g1, numsets, type_v)); + + int source_vert = param.source_vert; + nvgraphTraversalParameter_t traversal_param; + nvgraphTraversalParameterInit(&traversal_param); + nvgraphTraversalSetDistancesIndex(&traversal_param, 0); + + + // run + std::cout << "Running algorithm ..." << std::endl; + double start, stop; + start = second(); + start = second(); + int repeat = std::max(param.repeats, 1); + for (int i = 0; i < repeat; i++) + NVGRAPH_SAFE_CALL(nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param)); + stop = second(); + printf("Time of single Traversal call is %10.8fsecs\n", (stop-start)/repeat); + + NVGRAPH_SAFE_CALL(nvgraphDestroyGraphDescr(handle, g1)); + + if (handle != NULL) + { + NVGRAPH_SAFE_CALL(nvgraphDestroy(handle)); + handle = NULL; + } +} + +typedef struct Pagerank_Usecase_t +{ + std::string graph_file; + float alpha; + int repeats; + int max_iters; + double tolerance; + Pagerank_Usecase_t(const std::string& a, float b, const int c, const int d, const double e) : graph_file(a), alpha(b), repeats(c), max_iters(d), tolerance(e) {}; + Pagerank_Usecase_t& operator=(const Pagerank_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + alpha = rhs.alpha; + repeats = rhs.repeats; + max_iters = rhs.max_iters; + tolerance = rhs.tolerance; + return *this; + } +} Pagerank_Usecase; + +template +void run_pagerank_bench(const Pagerank_Usecase& param) +{ + std::cout << "Initializing nvGRAPH library..." << std::endl; + nvgraphHandle_t handle = NULL; + + if (handle == NULL) + { + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + } + + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + std::cout << "Reading input data..." 
<< std::endl; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + if (fpin == NULL) + { + std::cout << "Cannot open input graph file: " << param.graph_file << std::endl; + exit(1); + } + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + if (read_header_amgx_csr_bin (fpin, n, nnz) != 0) + { + std::cout << "Cannot read input graph file: " << param.graph_file << std::endl; + exit(1); + } + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + std::vector dangling(n); + if (read_data_amgx_csr_bin_rhs (fpin, n, nnz, read_row_ptr, read_col_ind, read_val, dangling) != 0) + { + std::cout << "Cannot read input graph file: " << param.graph_file << std::endl; + exit(1); + } + fclose(fpin); + + std::cout << "Initializing data structures ..." << std::endl; + + nvgraphGraphDescr_t g1 = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + NVGRAPH_SAFE_CALL(nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo)); + + // set up graph data + std::vector calculated_res(n, (T)1.0/n); + void* vertexptr[2] = {(void*)&dangling[0], (void*)&calculated_res[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + NVGRAPH_SAFE_CALL(nvgraphAllocateVertexData(handle, g1, 2, type_v)); + NVGRAPH_SAFE_CALL(nvgraphSetVertexData(handle, g1, vertexptr[0], 0 )); + NVGRAPH_SAFE_CALL(nvgraphSetVertexData(handle, g1, vertexptr[1], 1 )); + NVGRAPH_SAFE_CALL(nvgraphAllocateEdgeData(handle, g1, 1, type_e )); + NVGRAPH_SAFE_CALL(nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 )); + + int bookmark_index = 0; + int weight_index = 0; + T alpha = param.alpha; + int pagerank_index = 1; + int has_guess = 0; + float tolerance = (T)param.tolerance; + int max_iter = param.max_iters; + + std::cout << "Running algorithm ..." 
<< std::endl; + // run + double start, stop; + start = second(); + start = second(); + int repeat = std::max(param.repeats, 1); + for (int i = 0; i < repeat; i++) + NVGRAPH_SAFE_CALL(nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter)); + stop = second(); + printf("Time of single Pargerank call is %10.8fsecs\n", (stop-start)/repeat); + + NVGRAPH_SAFE_CALL(nvgraphDestroyGraphDescr(handle, g1)); + + if (handle != NULL) + { + NVGRAPH_SAFE_CALL(nvgraphDestroy(handle)); + handle = NULL; + } +} + +typedef struct ModMax_Usecase_t +{ + std::string graph_file; + int clusters; + int evals; + int repeats; + ModMax_Usecase_t(const std::string& a, int b, int c, int d) : graph_file(a), clusters(b), evals(c), repeats(d){}; + ModMax_Usecase_t& operator=(const ModMax_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + clusters = rhs.clusters; + evals = rhs.evals; + repeats = rhs.repeats; + return *this; + } +} ModMax_Usecase; + +template +void run_modularity_bench(const ModMax_Usecase& param) +{ + // this function prints : + // #clusters,time in ms,modularity + + nvgraphHandle_t handle = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + + int m, n, nnz; + MM_typecode mc; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + + mm_properties(fpin, 1, &mc, &m, &n, &nnz) ; + + // Allocate memory on host + std::vector cooRowIndA(nnz); + std::vector csrColIndA(nnz); + std::vector csrRowPtrA(n+1); + std::vector csrValA(nnz); + + mm_to_coo(fpin, 1, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0],NULL) ; + coo_to_csr (n, n, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0], NULL, &csrRowPtrA[0], NULL, NULL, NULL); + fclose(fpin); + + //remove diagonal + for (int i = 0; i < n; i++) + for (int j = csrRowPtrA[i]; j < csrRowPtrA[i+1]; j++) + if (csrColIndA[j]==i) + csrValA[j] = 0.0; + + nvgraphGraphDescr_t g1 = NULL; + + struct SpectralClusteringParameter clustering_params; + clustering_params.n_clusters = param.clusters; + clustering_params.n_eig_vects = param.evals; + clustering_params.algorithm = NVGRAPH_MODULARITY_MAXIMIZATION; + clustering_params.evs_tolerance = 0.0f ; + clustering_params.evs_max_iter = 0; + clustering_params.kmean_tolerance = 0.0f; + clustering_params.kmean_max_iter = 0; + + int weight_index = 0; + + //std::vector clustering_h(n); + //std::vector eigVals_h(clustering_params.n_clusters); + //std::vector eigVecs_h(n*clustering_params.n_clusters); + + //could also be on device + int *clustering_d; cudaMalloc((void**)&clustering_d , n*sizeof(int)); + T* eigVals_d; cudaMalloc((void**)&eigVals_d, clustering_params.n_clusters*sizeof(T)); + T* eigVecs_d; cudaMalloc((void**)&eigVecs_d, n*clustering_params.n_clusters*sizeof(T)); + + NVGRAPH_SAFE_CALL( nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &csrRowPtrA[0], &csrColIndA[0]}; + NVGRAPH_SAFE_CALL( nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32)); + + // set up graph data + size_t numsets = 1; + void* edgeptr[1] = {(void*)&csrValA[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + NVGRAPH_SAFE_CALL( nvgraphAllocateEdgeData(handle, g1, numsets, type_e )); + NVGRAPH_SAFE_CALL( nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 )); + + printf("%d,", clustering_params.n_clusters); + + double start, stop; + start = second(); + int repeat = std::max(param.repeats, 1); + for (int i = 0; i < repeat; i++) + // NVGRAPH_SAFE_CALL(nvgraphSpectralClustering(handle, g1, weight_index, &clustering_params, 
(int*)&clustering_h[0], (void*)&eigVals_h[0], (void*)&eigVecs_h[0])); + NVGRAPH_SAFE_CALL(nvgraphSpectralClustering(handle, g1, weight_index, &clustering_params, clustering_d, eigVals_d, eigVecs_d)); + //for (int i = 0; i < repeat; i++) + // NVGRAPH_SAFE_CALL( nvgraphSpectralModularityMaximization(handle, g1, weight_index, clustering_params.n_clusters, clustering_params.n_eig_vects, 0.0f, 0, 0.0f, 0, clustering_d, (void*)&eigVals_h[0], (void*)&eigVecs_h[0])); + //for (int i = 0; i < repeat; i++) + // NVGRAPH_SAFE_CALL( nvgraphBalancedCutClustering(handle, g1, weight_index, clustering_params.n_clusters, clustering_params.n_eig_vects, 0, 0.0f, 0, 0.0f, 0, clustering_d, (void*)&eigVals_h[0], (void*)&eigVecs_h[0])); + stop = second(); + printf("%10.8f,", 1000.0*(stop-start)/repeat); + + //Print + //std::vector clust_h(n); + //cudaMemcpy(&clust_h[0], clustering_d,n*sizeof(int),cudaMemcpyDeviceToHost); + //printf("\n "); + //for (int i = 0; i < n; ++i) + // printf("%d ", clust_h [i]); + //printf("\n "); + //for (int i = 0; i < clustering_params.n_clusters; ++i) + // std::cout << eigVals_h[i]<< ' ' ; + //printf("\n "); + //std::cout<< std::endl; + //std::cout << std::endl; + //for (int i = 0; i < clustering_params.n_clusters; ++i) + //{ + // for (int j = 0; j < 10; ++j) + // std::cout << eigVecs_h[i*n+j] << ' '; + // std::cout<< std::endl; + //} + + // Analyse quality + float score =0.0; + nvgraphAnalyzeClustering(handle, g1, weight_index, clustering_params.n_clusters, clustering_d, NVGRAPH_MODULARITY, &score); + printf("%f\n", score); + + // ratio cut + // float ec =0.0, rc =0.0; + // NVGRAPH_SAFE_CALL(nvgraphAnalyzeBalancedCut(handle, g1, weight_index, clustering_params.n_clusters, clustering_d, &ec, &rc)); + // printf("%f,", rc); + + // // Synthetic random + // for (int i=0; i +void run_balancedCut_bench(const BalancedCut_Usecase& param) +{ + // this function prints : + // #clusters,time in ms,rc + + nvgraphHandle_t handle = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + + int m, n, nnz; + MM_typecode mc; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + + mm_properties(fpin, 1, &mc, &m, &n, &nnz) ; + + // Allocate memory on host + std::vector cooRowIndA(nnz); + std::vector csrColIndA(nnz); + std::vector csrRowPtrA(n+1); + std::vector csrValA(nnz); + + mm_to_coo(fpin, 1, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0],NULL) ; + coo_to_csr (n, n, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0], NULL, &csrRowPtrA[0], NULL, NULL, NULL); + fclose(fpin); + + //remove diagonal + for (int i = 0; i < n; i++) + for (int j = csrRowPtrA[i]; j < csrRowPtrA[i+1]; j++) + if (csrColIndA[j]==i) + csrValA[j] = 0.0; + + nvgraphGraphDescr_t g1 = NULL; + + struct SpectralClusteringParameter clustering_params; + clustering_params.n_clusters = param.clusters; + clustering_params.n_eig_vects = param.evals; + clustering_params.algorithm = NVGRAPH_BALANCED_CUT_LANCZOS; + clustering_params.evs_tolerance = 0.0f ; + clustering_params.evs_max_iter = 0; + clustering_params.kmean_tolerance = 0.0f; + clustering_params.kmean_max_iter = 0; + + int weight_index = 0; + + //std::vector clustering_h(n); + //std::vector eigVals_h(clustering_params.n_clusters); + //std::vector eigVecs_h(n*clustering_params.n_clusters); + + //could also be on device + int *clustering_d; cudaMalloc((void**)&clustering_d , n*sizeof(int)); + T* eigVals_d; cudaMalloc((void**)&eigVals_d, clustering_params.n_clusters*sizeof(T)); + T* eigVecs_d; cudaMalloc((void**)&eigVecs_d, n*clustering_params.n_clusters*sizeof(T)); + + NVGRAPH_SAFE_CALL( 
nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &csrRowPtrA[0], &csrColIndA[0]}; + NVGRAPH_SAFE_CALL( nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32)); + + // set up graph data + size_t numsets = 1; + void* edgeptr[1] = {(void*)&csrValA[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + NVGRAPH_SAFE_CALL( nvgraphAllocateEdgeData(handle, g1, numsets, type_e )); + NVGRAPH_SAFE_CALL( nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 )); + + printf("%d,", clustering_params.n_clusters); + + double start, stop; + start = second(); + int repeat = std::max(param.repeats, 1); + for (int i = 0; i < repeat; i++) + // NVGRAPH_SAFE_CALL(nvgraphSpectralClustering(handle, g1, weight_index, &clustering_params, (int*)&clustering_h[0], (void*)&eigVals_h[0], (void*)&eigVecs_h[0])); + NVGRAPH_SAFE_CALL(nvgraphSpectralClustering(handle, g1, weight_index, &clustering_params, clustering_d, eigVals_d, eigVecs_d)); + stop = second(); + printf("%10.8f,", 1000.0*(stop-start)/repeat); + + // Analyse quality + float score =0.0; + nvgraphAnalyzeClustering(handle, g1, weight_index, clustering_params.n_clusters, clustering_d, NVGRAPH_RATIO_CUT, &score); + printf("%f\n", score); + + //exit + cudaFree(clustering_d); + cudaFree(eigVals_d); + cudaFree(eigVecs_d); + + NVGRAPH_SAFE_CALL(nvgraphDestroyGraphDescr(handle, g1)); +} + +typedef struct TriCount_Usecase_t +{ + std::string graph_file; + int repeats; + TriCount_Usecase_t(const std::string& a, const int b) : graph_file(a), repeats(b){}; + TriCount_Usecase_t& operator=(const TriCount_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + repeats = rhs.repeats; + return *this; + } +} TriCount_Usecase; + +void run_tricount_bench(const TriCount_Usecase& param) +{ + std::cout << "Initializing nvGRAPH library..." << std::endl; + + nvgraphHandle_t handle = NULL; + + if (handle == NULL) + { + NVGRAPH_SAFE_CALL(nvgraphCreate(&handle)); + } + + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + + std::cout << "Reading input data..." << std::endl; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + if (fpin == NULL) + { + std::cout << "Cannot open input graph file: " << param.graph_file << std::endl; + exit(1); + } + + int n, nnz; + std::vector read_row_ptr, read_col_ind; + //Read CSR of lower triangular of undirected graph + if (read_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind) != 0) + { + std::cout << "Error reading input file: " << param.graph_file << std::endl; + exit(1); + } + fclose(fpin); + + std::cout << "Initializing data structures ..." << std::endl; + + nvgraphGraphDescr_t g1 = NULL; + NVGRAPH_SAFE_CALL(nvgraphCreateGraphDescr(handle, &g1)); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + NVGRAPH_SAFE_CALL(nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo)); + + // set up graph data + uint64_t res = 0; + // run + std::cout << "Running algorithm..." 
<< std::endl; + double start, stop; + start = second(); + start = second(); + int repeat = std::max(param.repeats, 1); + for (int i = 0; i < repeat; i++) + NVGRAPH_SAFE_CALL(nvgraphTriangleCount(handle, g1, &res)); + stop = second(); + printf("Number of triangles counted: %lli\n", (long long int)res); + printf("Time of single TriangleCount call is %10.8fsecs\n", (stop-start)/repeat); + + NVGRAPH_SAFE_CALL(nvgraphDestroyGraphDescr(handle, g1)); + + if (handle != NULL) + { + NVGRAPH_SAFE_CALL(nvgraphDestroy(handle)); + handle = NULL; + } +} + + +int findParamIndex(const char** argv, int argc, const char* parm) +{ + int count = 0; + int index = -1; + + for (int i = 0; i < argc; i++) + { + if (strncmp(argv[i], parm, 100)==0) + { + index = i; + count++; + } + } + + if (count == 0 || count == 1) + { + return index; + } + else + { + printf("Error, parameter %s has been specified more than once, exiting\n",parm); + exit(1); + } + + return -1; +} + +int main(int argc, const char **argv) +{ + int pidx = 0; + int repeats = 100; + + if (argc < 2 || findParamIndex(argv, argc, "--help") != -1) + { + printf("Usage: \n"); + printf(" nvgraph_benchmark [--double|--float] [--repeats N] --spmv graph_file \n"); + printf(" nvgraph_benchmark [--double|--float] [--repeats N] --widest graph_file start_vertex \n"); + printf(" nvgraph_benchmark [--double|--float] [--repeats N] --sssp graph_file start_vertex \n"); + printf(" nvgraph_benchmark [--double|--float] [--repeats N] --pagerank graph_file alpha max_iters tolerance \n"); + printf(" nvgraph_benchmark [--double|--float] [--repeats N] --modularity graph_file nb_clusters nb_eigvals \n"); + printf(" nvgraph_benchmark [--double|--float] [--repeats N] --traversal graph_file start_vertex \n"); + printf(" nvgraph_benchmark [--double|--float] [--repeats N] --balancedCut graph_file nb_clusters nb_eigvals \n"); + printf(" nvgraph_benchmark [--repeats N] --tricount graph_file \n"); + exit(0); + } + + if ( (pidx = findParamIndex(argv, argc, "--repeats")) != -1) + { + repeats = atoi(argv[pidx+1]); + } + + if (findParamIndex(argv, argc, "--double") != -1 || findParamIndex(argv, argc, "--float") == -1) + { + if ((pidx = findParamIndex(argv, argc, "--widest")) != -1) + { + run_widest_bench(WidestPath_Usecase(argv[pidx+1], atoi(argv[pidx+2]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--spmv")) != -1) + { + run_srspmv_bench(SrSPMV_Usecase(argv[pidx+1], repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--sssp")) != -1) + { + run_sssp_bench(SSSP_Usecase(argv[pidx+1], atoi(argv[pidx+2]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--pagerank")) != -1) + { + run_pagerank_bench(Pagerank_Usecase(argv[pidx+1], atof(argv[pidx+2]), repeats, atoi(argv[pidx+3]), atof(argv[pidx+4]))); + } + else if ((pidx = findParamIndex(argv, argc, "--modularity")) != -1) + { + run_modularity_bench(ModMax_Usecase(argv[pidx+1], atoi(argv[pidx+2]), atoi(argv[pidx+3]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--traversal")) != -1) + { + run_traversal_bench(Traversal_Usecase(argv[pidx+1], atoi(argv[pidx+2]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--balancedCut")) != -1) + { + run_balancedCut_bench(BalancedCut_Usecase(argv[pidx+1], atoi(argv[pidx+2]), atoi(argv[pidx+3]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--tricount")) != -1) + { + run_tricount_bench(TriCount_Usecase(argv[pidx+1], repeats)); + } + else + { + printf("Specify one of the algorithms: '--widest', '--sssp', '--pagerank', 
'--modularity', '--balancedCut', '--traversal', or 'tricount'\n"); + } + } + else + { + if ((pidx = findParamIndex(argv, argc, "--widest")) != -1) + { + run_widest_bench(WidestPath_Usecase(argv[pidx+1], atoi(argv[pidx+2]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--spmv")) != -1) + { + run_srspmv_bench(SrSPMV_Usecase(argv[pidx+1], repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--sssp")) != -1) + { + run_sssp_bench(SSSP_Usecase(argv[pidx+1], atoi(argv[pidx+2]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--pagerank")) != -1) + { + run_pagerank_bench(Pagerank_Usecase(argv[pidx+1], atof(argv[pidx+2]), repeats, atoi(argv[pidx+3]), atof(argv[pidx+4]))); + } + else if ((pidx = findParamIndex(argv, argc, "--modularity")) != -1) + { + run_modularity_bench(ModMax_Usecase(argv[pidx+1], atoi(argv[pidx+2]), atoi(argv[pidx+3]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--traversal")) != -1) + { + run_traversal_bench(Traversal_Usecase(argv[pidx+1], atoi(argv[pidx+2]), repeats)); + } + else if ((pidx = findParamIndex(argv, argc, "--balancedCut")) != -1) + { + run_balancedCut_bench(BalancedCut_Usecase(argv[pidx+1], atoi(argv[pidx+2]), atoi(argv[pidx+3]), repeats)); + } + else + { + printf("Specify one of the algorithms: '--widest', '--sssp' , '--pagerank', '--modularity', '--balancedCut' or '--traversal'\n"); + } + } + + return 0; +} + diff --git a/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_2d_bfs.cpp b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_2d_bfs.cpp new file mode 100644 index 00000000000..044701ccabb --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_2d_bfs.cpp @@ -0,0 +1,762 @@ +// This is gtest application that contains all of the C API tests. Parameters: +// nvgraph_capi_tests [--perf] [--stress-iters N] [--gtest_filter=NameFilterPatter] +// It also accepts any other gtest (1.7.0) default parameters. +// Right now this application contains: +// 1) Sanity Check tests - tests on simple examples with known answer (or known behaviour) +// 2) Correctness checks tests - tests on real graph data, uses reference algorithm +// (CPU code for SrSPMV and python scripts for other algorithms, see +// python scripts here: //sw/gpgpu/nvgraph/test/ref/) with reference results, compares those two. +// It also measures performance of single algorithm C API call, enf enabled (see below) +// 3) Corner cases tests - tests with some bad inputs, bad parameters, expects library to handle +// it gracefully +// 4) Stress tests - makes sure that library result is persistent throughout the library usage +// (a lot of C API calls). Also makes some assumptions and checks on memory usage during +// this test. +// +// We can control what tests to launch by using gtest filters. For example: +// Only sanity tests: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Sanity* +// And, correspondingly: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Correctness* +// ./nvgraph_capi_tests_traversal --gtest_filter=*Corner* +// ./nvgraph_capi_tests_traversal --gtest_filter=*Stress* +// Or, combination: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Sanity*:*Correctness* +// +// Performance reports are provided in the ERIS format and disabled by default. +// Could be enabled by adding '--perf' to the command line. 
I added this parameter to vlct +// +// Parameter '--stress-iters N', which gives multiplier (not an absolute value) for the number of launches for stress tests +// + +#include + +#include "gtest/gtest.h" + +#include "nvgraph_test_common.h" + +#include "valued_csr_graph.hxx" +#include "readMatrix.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include // experimental header, contains hidden API entries, can be shared only under special circumstances without reveling internal things + +#include "stdlib.h" +#include +#include +#include +#include +#include + +// do the perf measurements, enabled by command line parameter '--perf' +static int PERF = 0; + +// minimum vertices in the graph to perform perf measurements +#define PERF_ROWS_LIMIT 10000 + +// number of repeats = multiplier/num_vertices +#define Traversal_ITER_MULTIPLIER 30000000 + +template +struct nvgraph_Const; + +template<> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_32I; + static const int inf; +}; +const int nvgraph_Const::inf = INT_MAX; + +static std::string ref_data_prefix = ""; +static std::string graph_data_prefix = ""; + +// iterations for stress tests = this multiplier * iterations for perf tests +static int STRESS_MULTIPLIER = 10; + +void offsetsToIndices(std::vector& offsets, std::vector& indices) { + int nnz = offsets.back(); + indices.resize(nnz); + int n = offsets.size() - 1; + for (int row = 0; row < n; row++) { + for (int pos = offsets[row]; pos < offsets[row + 1]; pos++) + indices[pos] = row; + } +} + +bool enough_device_memory(int n, int nnz, size_t add) + { + size_t mtotal, mfree; + cudaMemGetInfo(&mfree, &mtotal); + if (mfree > add + sizeof(int) * (4 * n)) //graph + pred + distances + 2n (working data) + return true; + return false; +} + +std::string convert_to_local_path(const std::string& in_file) + { + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (graph_data_prefix.length() > 0) + { + prefix = graph_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\matrices_collection\\"; + prefix = "Z:\\matrices_collection\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); +#else + prefix = "/mnt/nvgraph_test_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + +void ref_bfs(int n, + int nnz, + int *rowPtr, + int *colInd, + int *mask, + int source_vertex, + int *distances) { + for (int i = 0; i != n; ++i) + distances[i] = -1; + + std::queue q; + q.push(source_vertex); + distances[source_vertex] = 0; + + while (!q.empty()) { + int u = q.front(); + q.pop(); + + for (int iCol = rowPtr[u]; iCol != rowPtr[u + 1]; ++iCol) { + if (mask && !mask[iCol]) + continue; + int v = colInd[iCol]; + if (distances[v] == -1) { //undiscovered + distances[v] = distances[u] + 1; + q.push(v); + } + } + + } +} + +typedef struct Traversal_Usecase_t +{ + std::string graph_file; + int source_vert; + bool useMask; + bool undirected; + + Traversal_Usecase_t(const std::string& a, int b, bool _useMask = false, bool _undirected = false) : + source_vert(b), useMask(_useMask), undirected(_undirected) { + graph_file = convert_to_local_path(a); + } + ; + + Traversal_Usecase_t& operator=(const Traversal_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + useMask = rhs.useMask; + return *this; + } +} Traversal_Usecase; + +//// Traversal tests + +class NVGraphCAPITests_2d_bfs: public ::testing::TestWithParam { +public: + NVGraphCAPITests_2d_bfs() : + handle(NULL) { + } + + static void SetupTestCase() { 
+ } + static void TearDownTestCase() { + } + virtual void SetUp() { + if (handle == NULL) { + char* nvgraph_gpus = getenv("NVGRAPH_GPUS"); + if (nvgraph_gpus) + printf("Value of NVGRAPH_GPUS=%s\n", nvgraph_gpus); + else + printf("Value of NVGRAPH_GPUS is null\n"); + std::vector gpus; + int32_t dummy; + std::stringstream ss(nvgraph_gpus); + while (ss >> dummy) { + gpus.push_back(dummy); + if (ss.peek() == ',') + ss.ignore(); + } + printf("There were %d devices found: ", (int) gpus.size()); + for (int i = 0; i < gpus.size(); i++) + std::cout << gpus[i] << " "; + std::cout << "\n"; + + devices = (int32_t*) malloc(sizeof(int32_t) * gpus.size()); + for (int i = 0; i < gpus.size(); i++) + devices[i] = gpus[i]; + numDevices = gpus.size(); + + status = nvgraphCreateMulti(&handle, numDevices, devices); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + if (devices) + free(devices); + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + int32_t *devices; + int32_t numDevices; + + template + void run_current_test(const Traversal_Usecase& param) { + const ::testing::TestInfo* const test_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.source_vert; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + + std::string("_") + ss.str().c_str(); + + nvgraphTopologyType_t topo = NVGRAPH_2D_32I_32I; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(), "rb"); + ASSERT_TRUE(fpin != NULL)<< "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a network in amgx binary format + ASSERT_EQ(read_header_amgx_csr_bin(fpin, n, nnz), 0); + std::vector read_row_ptr(n + 1), read_col_ind(nnz); + std::vector csr_read_val(nnz); + std::cout << getFileName(param.graph_file) << " Vertices: " << n << " Edges: " << nnz << "\n"; + ASSERT_EQ(read_data_amgx_csr_bin(fpin, n, nnz, read_row_ptr, read_col_ind, csr_read_val), 0); + fclose(fpin); + std::vector row_ind; + offsetsToIndices(read_row_ptr, row_ind); + + std::vector csr_mask(nnz, 1); + + if (param.useMask) { + //Generating a mask + //Should be improved + for (int i = 0; i < nnz; i += 2) + csr_mask[i] = 0; + } + + if (!enough_device_memory(n, nnz, sizeof(int) * (read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." 
<< test_info->name() + << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + int32_t blockN = std::max(2,(int)ceil(sqrt(numDevices))); + nvgraph2dCOOTopology32I_st topology = { n, nnz, &row_ind[0], &read_col_ind[0], CUDA_R_32I, + NULL, numDevices, devices, blockN, NVGRAPH_DEFAULT }; + status = nvgraphSetGraphStructure(handle, g1, (void*) &topology, topo); + + // set up graph data + std::vector calculated_distances_res(n); + std::vector calculated_predecessors_res(n); + +// if (param.useMask) { +// status = nvgraphAllocateEdgeData(handle, g1, numsets_e, type_e); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// } + + int source_vert = param.source_vert; + + if (param.useMask) { + //if we need to use a mask + //Copying mask into graph + + //status = nvgraphSetEdgeData(handle, g1, &csr_mask[0], 0); + //ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + //nvgraphTraversalSetEdgeMaskIndex(&traversal_param, 0); + } + + status = nvgraph2dBfs(handle, + g1, + source_vert, + &calculated_distances_res[0], + &calculated_predecessors_res[0]); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDeviceSynchronize(); + + if (PERF && n > PERF_ROWS_LIMIT) + { + double start, stop; + start = second(); + int repeat = 30; + for (int i = 0; i < repeat; i++) + { + status = nvgraph2dBfs(handle, + g1, + source_vert, + &calculated_distances_res[0], + &calculated_predecessors_res[0]); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", + test_id.c_str(), + 1000.0 * (stop - start) / repeat); + } + + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // check with reference + std::vector expected_distances_res(n); + ref_bfs(n, + nnz, + &read_row_ptr[0], + &read_col_ind[0], + &csr_mask[0], + source_vert, + &expected_distances_res[0]); + //Checking distances +// int wrong = 0; +// for (int i = 0; i < n; i++) { +// if (expected_distances_res[i] != calculated_distances_res[i]) { +// wrong++; +// std::cout << "Error at " << i << " expected " << expected_distances_res[i] << " actual " +// << calculated_distances_res[i] << "\n"; +// } +// } +// std::cout << wrong << "/" << n << " distances are incorrect.\n"; + for (int i = 0; i < n; ++i) + { + ASSERT_EQ(expected_distances_res[i], calculated_distances_res[i])<< "Wrong distance from source in row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n"; + } + + //Checking predecessors + for (int i = 0; i < n; ++i) { + if (calculated_predecessors_res[i] != -1) { + ASSERT_EQ(expected_distances_res[i], expected_distances_res[calculated_predecessors_res[i]] + 1)<< "Wrong predecessor in row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n"; + } else { + ASSERT_TRUE(expected_distances_res[i] == 0 || expected_distances_res[i] == -1) << "Wrong predecessor in row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n"; + } + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(NVGraphCAPITests_2d_bfs, CheckResult) { + run_current_test(GetParam()); +} /// Few sanity checks. 
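// --- Editor's note (illustrative sketch, not part of the original diff) ---
// The correctness test above validates nvgraph2dBfs() against the host-side
// ref_bfs() helper defined earlier in this file. The hypothetical standalone
// program below shows that same reference check on a tiny hand-built CSR
// graph: run the queue-based CPU BFS, then compare distances element-wise,
// mirroring the ASSERT_EQ loop in run_current_test().
#include <cassert>
#include <queue>
#include <vector>

static void cpu_bfs(int n, const int* rowPtr, const int* colInd,
                    int source, std::vector<int>& dist) {
    dist.assign(n, -1);                    // -1 means "not reached", as in ref_bfs
    std::queue<int> q;
    q.push(source);
    dist[source] = 0;
    while (!q.empty()) {
        int u = q.front(); q.pop();
        for (int e = rowPtr[u]; e != rowPtr[u + 1]; ++e) {
            int v = colInd[e];
            if (dist[v] == -1) { dist[v] = dist[u] + 1; q.push(v); }
        }
    }
}

int main() {
    // Directed 5-vertex graph in CSR form: 0->1, 0->2, 1->3, 2->3, 3->4.
    const int rowPtr[] = {0, 2, 3, 4, 5, 5};
    const int colInd[] = {1, 2, 3, 3, 4};
    std::vector<int> dist;
    cpu_bfs(5, rowPtr, colInd, /*source=*/0, dist);
    const int expected[] = {0, 1, 1, 2, 3};
    for (int i = 0; i < 5; ++i)
        assert(dist[i] == expected[i]);    // same element-wise distance check as the test
    return 0;
}
// --- End editor's note ---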
+ +class NVGraphCAPITests_2d_bfs_Sanity: public ::testing::Test { +public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + int32_t* devices; + int32_t numDevices; + + NVGraphCAPITests_2d_bfs_Sanity() : + handle(NULL) { + } + + static void SetupTestCase() { + } + static void TearDownTestCase() { + } + virtual void SetUp() { + topo = NVGRAPH_2D_32I_32I; + nvgraphStatus_t status; + if (handle == NULL) { + char* nvgraph_gpus = getenv("NVGRAPH_GPUS"); + if (nvgraph_gpus) + printf("Value of NVGRAPH_GPUS=%s\n", nvgraph_gpus); + else + printf("Value of NVGRAPH_GPUS is null\n"); + std::vector gpus; + int32_t dummy; + std::stringstream ss(nvgraph_gpus); + while (ss >> dummy) { + gpus.push_back(dummy); + if (ss.peek() == ',') + ss.ignore(); + } + printf("There were %d devices found: ", (int) gpus.size()); + for (int i = 0; i < gpus.size(); i++) + std::cout << gpus[i] << " "; + std::cout << "\n"; + + devices = (int32_t*) malloc(sizeof(int32_t) * gpus.size()); + for (int i = 0; i < gpus.size(); i++) + devices[i] = gpus[i]; + numDevices = gpus.size(); + + status = nvgraphCreateMulti(&handle, numDevices, devices); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void prepare_and_run(nvgraph2dCOOTopology32I_st& topo_st, int* expected) + { + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topo_st.nvertices; + nnz = topo_st.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*) &topo_st, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + int source_vert = 0; + + // Call BFS + std::vector calculated_dist(n); + std::vector calculated_pred(n); + status = nvgraph2dBfs(handle, g1, source_vert, &calculated_dist[0], &calculated_pred[0]); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // Check results against reference implementation + for (int row = 0; row < n; row++) { + int reference_res = (int) expected[row]; + int nvgraph_res = (int) calculated_dist[row]; + ASSERT_EQ(reference_res, nvgraph_res); + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + +// cycle graph, shortest path = vertex number + template + void run_cycle_test() + { + n = 1024; + nnz = n; + std::vector offsets(n + 1), neighborhood(n); + for (int i = 0; i < n; i++) { + offsets[i] = i; + neighborhood[i] = (i + 1) % n; + } + offsets[n] = n; + std::vector expected_res(n, nvgraph_Const::inf); + for (int i = 0; i < n; i++) { + expected_res[i] = i; + } + int32_t blockN = std::max(2,(int)ceil(sqrt(numDevices))); + nvgraph2dCOOTopology32I_st topology = { n, nnz, &offsets[0], &neighborhood[0], CUDA_R_32I, + NULL, numDevices, devices, blockN, NVGRAPH_DEFAULT }; + + prepare_and_run(topology, &expected_res[0]); + free(devices); + } + + template + void run_cycle_test_undirected() + { + n = 16; + nnz = n * 2; + std::vector offsets(n + 1), neighborhood(nnz); + for (int i = 0; i < n; i++) { + offsets[i] = i * 2; + neighborhood[i * 2] = (i - 1 + n) % n; + neighborhood[i * 2 + 1] = (i + 1 + n) % n; + } + offsets[n] = nnz; + std::vector expected_res(n, nvgraph_Const::inf); + for (int i = 0; i < n; i++) { + expected_res[i] = i; + } + int32_t blockN = std::max(2,(int)ceil(sqrt(numDevices))); + nvgraph2dCOOTopology32I_st topology = { n, nnz, &offsets[0], 
&neighborhood[0], CUDA_R_32I, + NULL, numDevices, devices, blockN, NVGRAPH_DEFAULT }; + + prepare_and_run(topology, &expected_res[0]); + free(devices); + } + + template + void run_block_skip_test() { + n = 10; + nnz = 4; + int rowIndices[4] = { 0, 1, 5, 6 }; + int columnIndices[4] = { 1, 5, 6, 3 }; + int expected[10] = { 0, 1, -1, 4, -1, 2, 3, -1, -1, -1 }; + int32_t blockN = std::max(2,(int)ceil(sqrt(numDevices))); + nvgraph2dCOOTopology32I_st topology = { n, nnz, rowIndices, columnIndices, CUDA_R_32I, + NULL, numDevices, devices, blockN, NVGRAPH_DEFAULT }; + prepare_and_run(topology, expected); + free(devices); + } + + template + void run_multi_path_test() { + n = 10; + nnz = 6; + int rowIndices[6] = { 0, 0, 1, 5, 5, 6 }; + int columnIndices[6] = { 1, 5, 6, 6, 9, 9 }; + int expected[10] = { 0, 1, -1, -1, -1, 1, 2, -1, -1, 2 }; + int32_t blockN = std::max(2,(int)ceil(sqrt(numDevices))); + nvgraph2dCOOTopology32I_st topology = { n, nnz, rowIndices, columnIndices, CUDA_R_32I, + NULL, numDevices, devices, blockN, NVGRAPH_DEFAULT }; + prepare_and_run(topology, expected); + free(devices); + } + +}; + +TEST_F(NVGraphCAPITests_2d_bfs_Sanity, SanityCycle) +{ + run_cycle_test(); +} + +TEST_F(NVGraphCAPITests_2d_bfs_Sanity, BlockSkip) { + run_block_skip_test(); +} + +TEST_F(NVGraphCAPITests_2d_bfs_Sanity, MultiPath) { + run_multi_path_test(); +} + +// class NVGraphCAPITests_Traversal_Stress: public ::testing::TestWithParam { +// public: +// NVGraphCAPITests_Traversal_Stress() : +// handle(NULL) { +// } +// +// static void SetupTestCase() { +// } +// static void TearDownTestCase() { +// } +// virtual void SetUp() { +// //const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); +// //printf("We are in test %s of test case %s.\n", test_info->name(), test_info->test_case_name()); +// if (handle == NULL) { +// int* devices = (int*)malloc(sizeof(int) * 2); +// devices[0] = 0; +// devices[1] = 1; +// status = nvgraphCreateMulti(&handle, 2, devices); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// free(devices); +// } +// } +// virtual void TearDown() { +// if (handle != NULL) { +// status = nvgraphDestroy(handle); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// handle = NULL; +// } +// } +// nvgraphStatus_t status; +// nvgraphHandle_t handle; +// +// template +// void run_current_test(const Traversal_Usecase& param) +// { +// nvgraphTopologyType_t topo = NVGRAPH_2D_32I_32I; +// +// nvgraphStatus_t status; +// +// FILE* fpin = fopen(param.graph_file.c_str(), "rb"); +// ASSERT_TRUE(fpin != NULL)<< "Cannot read input graph file: " << param.graph_file << std::endl; +// int n, nnz; +// //Read a network in amgx binary format and the bookmark of dangling nodes +// ASSERT_EQ(read_header_amgx_csr_bin(fpin, n, nnz), 0); +// std::vector read_row_ptr(n + 1), read_col_ind(nnz); +// std::vector read_val(nnz); +// ASSERT_EQ(read_data_amgx_csr_bin(fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); +// fclose(fpin); +// +// nvgraphGraphDescr_t g1 = NULL; +// status = nvgraphCreateGraphDescr(handle, &g1); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// +// // set up graph +// int* devices = (int*)malloc(sizeof(int) * 2); +// devices[0] = 0; +// devices[1] = 1; +// nvgraph2dCOOTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0], CUDA_R_32I, NULL, 2, devices, 2, NVGRAPH_DEFAULT}; +// status = nvgraphSetGraphStructure(handle, g1, (void*) &topology, topo); +// free(devices); +// +// std::vector calculated_res(n); +// int source_vert = 
param.source_vert; +// +// // run +// int repeat = 2;//std::max((int)(((float)(Traversal_ITER_MULTIPLIER)*STRESS_MULTIPLIER)/(3*n)), 1); +// +// std::vector calculated_res1(n), calculated_res_mid1(n), calculated_res_last(n); +// std::vector calculated_res2(n), calculated_res_mid2(n); +// size_t free_mid = 0, free_last = 0, total = 0; +// for (int i = 0; i < repeat; i++) { +// status = nvgraphTraversal( handle, +// g1, +// NVGRAPH_TRAVERSAL_BFS, +// &source_vert, +// traversal_param); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// +// // all of those should be equal +// if (i == 0) +// { +// status = nvgraphGetVertexData(handle, +// g1, +// (void *) &calculated_res1[0], +// traversal_distances_index); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// status = nvgraphGetVertexData(handle, +// g1, +// (void *) &calculated_res2[0], +// traversal_predecessors_index); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// } +// else +// { +// status = nvgraphGetVertexData(handle, +// g1, +// (void *) &calculated_res_mid1[0], +// traversal_distances_index); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// status = nvgraphGetVertexData(handle, +// g1, +// (void *) &calculated_res_mid2[0], +// traversal_predecessors_index); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// +// for (int row = 0; row < n; row++) +// { +// ASSERT_EQ(calculated_res1[row], calculated_res_mid1[row])<< "Difference in result in distances for row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; +// // predecessors could be different since multiple shortest paths are possible +// //ASSERT_EQ(calculated_res2[row], calculated_res_mid2[row]) << "Difference in result in predecessors for row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; +// } +// } +// +// if (i == std::min(50, (int) (repeat / 2))) +// { +// cudaMemGetInfo(&free_mid, &total); +// } +// if (i == repeat - 1) +// { +// status = nvgraphGetVertexData(handle, +// g1, +// (void *) &calculated_res_last[0], +// traversal_distances_index); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// cudaMemGetInfo(&free_last, &total); +// } +// } +// +// ASSERT_LE(free_mid, free_last)<< "Memory difference between iteration #" << std::min(50, (int)(repeat/2)) << " and last iteration is " << (double)(free_last-free_mid)/1e+6 << "MB"; +// +// status = nvgraphDestroyGraphDescr(handle, g1); +// ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +// } +// }; +// +// TEST_P(NVGraphCAPITests_Traversal_Stress, Stress) +// { +// run_current_test(GetParam()); +// } + +// instatiation of the performance/correctness checks + +INSTANTIATE_TEST_CASE_P(CorrectnessCheck, + NVGraphCAPITests_2d_bfs, + // graph FILE source vert # file with expected result (in binary?) 
+ ::testing::Values( + Traversal_Usecase("graphs/cage/cage13_T.mtx.bin", 0) + , Traversal_Usecase("graphs/cage/cage13_T.mtx.bin", 10) + , Traversal_Usecase("graphs/cage/cage14_T.mtx.bin", 0) + , Traversal_Usecase("graphs/cage/cage14_T.mtx.bin", 10) + , Traversal_Usecase("graphs/small/small.bin", 0) + , Traversal_Usecase("graphs/small/small.bin", 0) + , Traversal_Usecase("graphs/small/small.bin", 3) + , Traversal_Usecase("graphs/dblp/dblp.bin", 0, false, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 100, false, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 1000, false, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 100000, false, true) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 0) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 100) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 10000) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 100000) + , Traversal_Usecase("graphs/Wikipedia/2011/wiki2011.bin", 1) + , Traversal_Usecase("graphs/Wikipedia/2011/wiki2011.bin", 1000) + , Traversal_Usecase("dimacs10/road_usa_T.mtx.bin", 100) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 0) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 100) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 10000) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 3000000) + , Traversal_Usecase("dimacs10/hugebubbles-00020_T.mtx.bin", 100000) + // /// instances using mask +// , Traversal_Usecase("graphs/small/small.bin", 0, true) +// , Traversal_Usecase("graphs/small/small.bin", 0, true) +// , Traversal_Usecase("graphs/small/small.bin", 3, true) +// , Traversal_Usecase("graphs/dblp/dblp.bin", 0, true) +// , Traversal_Usecase("graphs/dblp/dblp.bin", 100, true) +// , Traversal_Usecase("graphs/dblp/dblp.bin", 1000, true) +// , Traversal_Usecase("graphs/dblp/dblp.bin", 100000, true) +// , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 0, true) + ) + + ); + +// INSTANTIATE_TEST_CASE_P(StressTest, +// NVGraphCAPITests_Traversal_Stress, +// ::testing::Values( +// Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 0) +// ) +// ); + +int main(int argc, char **argv) + { + + for (int i = 0; i < argc; i++) + { + if (strcmp(argv[i], "--perf") == 0) + PERF = 1; + if (strcmp(argv[i], "--stress-iters") == 0) + STRESS_MULTIPLIER = atoi(argv[i + 1]); + if (strcmp(argv[i], "--ref-data-dir") == 0) + ref_data_prefix = std::string(argv[i + 1]); + if (strcmp(argv[i], "--graph-data-dir") == 0) + graph_data_prefix = std::string(argv[i + 1]); + } + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_2d_bfs_net.cpp b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_2d_bfs_net.cpp new file mode 100644 index 00000000000..bc5bd303c2c --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_2d_bfs_net.cpp @@ -0,0 +1,263 @@ +// This is gtest application that contains all of the C API tests. Parameters: +// nvgraph_capi_tests [--perf] [--stress-iters N] [--gtest_filter=NameFilterPatter] +// It also accepts any other gtest (1.7.0) default parameters. +// Right now this application contains: +// 1) Sanity Check tests - tests on simple examples with known answer (or known behaviour) +// 2) Correctness checks tests - tests on real graph data, uses reference algorithm +// (CPU code for SrSPMV and python scripts for other algorithms, see +// python scripts here: //sw/gpgpu/nvgraph/test/ref/) with reference results, compares those two. 
+// It also measures performance of single algorithm C API call, enf enabled (see below) +// 3) Corner cases tests - tests with some bad inputs, bad parameters, expects library to handle +// it gracefully +// 4) Stress tests - makes sure that library result is persistent throughout the library usage +// (a lot of C API calls). Also makes some assumptions and checks on memory usage during +// this test. +// +// We can control what tests to launch by using gtest filters. For example: +// Only sanity tests: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Sanity* +// And, correspondingly: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Correctness* +// ./nvgraph_capi_tests_traversal --gtest_filter=*Corner* +// ./nvgraph_capi_tests_traversal --gtest_filter=*Stress* +// Or, combination: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Sanity*:*Correctness* +// +// Performance reports are provided in the ERIS format and disabled by default. +// Could be enabled by adding '--perf' to the command line. I added this parameter to vlct +// +// Parameter '--stress-iters N', which gives multiplier (not an absolute value) for the number of launches for stress tests +// + +#include + +#include "gtest/gtest.h" + +#include "nvgraph_test_common.h" + +#include "valued_csr_graph.hxx" +#include "readMatrix.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include // experimental header, contains hidden API entries, can be shared only under special circumstances without reveling internal things + +#include "stdlib.h" +#include +#include +#include +#include +#include +#include +#include "cuda_profiler_api.h" + +// do the perf measurements, enabled by command line parameter '--perf' +static int PERF = 0; + +// minimum vertices in the graph to perform perf measurements +#define PERF_ROWS_LIMIT 10000 + +// number of repeats = multiplier/num_vertices +#define Traversal_ITER_MULTIPLIER 30000000 + +template +struct nvgraph_Const; + +template<> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_32I; + static const int inf; +}; +const int nvgraph_Const::inf = INT_MAX; + +typedef struct Traversal_Usecase_t +{ + std::string graph_file; + int source_vert; + size_t n; + size_t nnz; + bool useMask; + bool undirected; + + Traversal_Usecase_t(const std::string& a, + int source, + size_t _n, + size_t _nnz, + bool _useMask = false, + bool _undirected = false) : + source_vert(source), n(_n), nnz(_nnz), useMask(_useMask), undirected(_undirected) { + graph_file = a; + }; + + Traversal_Usecase_t& operator=(const Traversal_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + n = rhs.n; + nnz = rhs.nnz; + useMask = rhs.useMask; + return *this; + } +} Traversal_Usecase; + +//// Traversal tests + +class NVGraphCAPITests_2d_bfs: public ::testing::TestWithParam { +public: + NVGraphCAPITests_2d_bfs() : + handle(NULL) { + } + + static void SetupTestCase() { + } + static void TearDownTestCase() { + } + virtual void SetUp() { + if (handle == NULL) { + char* nvgraph_gpus = getenv("NVGRAPH_GPUS"); + if (nvgraph_gpus) + printf("Value of NVGRAPH_GPUS=%s\n", nvgraph_gpus); + else + printf("Value of NVGRAPH_GPUS is null\n"); + std::vector gpus; + int32_t dummy; + std::stringstream ss(nvgraph_gpus); + while (ss >> dummy){ + gpus.push_back(dummy); + if (ss.peek() == ',') + ss.ignore(); + } + printf("There were %d devices found: ", (int)gpus.size()); + for (int i = 0; i < gpus.size(); i++) + std::cout << gpus[i] << " "; + std::cout << "\n"; + + devices = (int32_t*) malloc(sizeof(int32_t) * 
gpus.size()); + for (int i = 0; i < gpus.size(); i++) + devices[i] = gpus[i]; + numDevices = gpus.size(); + + status = nvgraphCreateMulti(&handle, numDevices, devices); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + if (devices) + free(devices); + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + int32_t *devices; + int32_t numDevices; + + template + void run_current_test(const Traversal_Usecase& param) { + const ::testing::TestInfo* const test_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.source_vert; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + + std::string("_") + ss.str().c_str(); + + nvgraphTopologyType_t topo = NVGRAPH_2D_32I_32I; + nvgraphStatus_t status; + + // Read in graph from network file + std::vector sources; + std::vector destinations; + readNetworkFile(param.graph_file.c_str(), param.nnz, sources, destinations); + + // Create graph handle + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + int n = param.n; + int nnz = param.nnz; + int blockN = std::max(2,(int)ceil(sqrt(numDevices))); + std::cout << "Using " << blockN << " as block N\n"; + + nvgraph2dCOOTopology32I_st topology = { n, nnz, &sources[0], &destinations[0], CUDA_R_32I, + NULL, blockN, devices, numDevices, NVGRAPH_DEFAULT }; + status = nvgraphSetGraphStructure(handle, g1, (void*) &topology, topo); + + // set up graph data + std::vector calculated_distances_res(n); + std::vector calculated_predecessors_res(n); + + int source_vert = param.source_vert; + std::cout << "Starting from vertex: " << source_vert << "\n"; + cudaProfilerStart(); + status = nvgraph2dBfs(handle, + g1, + source_vert, + &calculated_distances_res[0], + &calculated_predecessors_res[0]); + cudaProfilerStop(); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDeviceSynchronize(); + + if (PERF && n > PERF_ROWS_LIMIT) { + double start, stop; + start = second(); + int repeat = 30; + for (int i = 0; i < repeat; i++) { + status = nvgraph2dBfs(handle, + g1, + source_vert, + &calculated_distances_res[0], + &calculated_predecessors_res[0]); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", + test_id.c_str(), + 1000.0 * (stop - start) / repeat); + } + + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //Checking distances + int visitedCount = 0; + for (int i = 0; i < n; ++i) { + if (calculated_distances_res[i] != -1) + visitedCount++; + } + std::cout << "There were " << visitedCount << " vertices visited.\n"; + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(NVGraphCAPITests_2d_bfs, CheckResult) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(CorrectnessCheck, + NVGraphCAPITests_2d_bfs, + ::testing::Values( + Traversal_Usecase("/mnt/nvgraph_test_data/Rmat100Mvertices2Bedges.net", 3, 100000000, 2000000000) + )); + +int main(int argc, char **argv) + { + + for (int i = 0; i < argc; i++) + { + if (strcmp(argv[i], "--perf") == 0) + PERF = 1; + } + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git 
a/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_algorithms.cpp b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_algorithms.cpp new file mode 100644 index 00000000000..9299e5171b1 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_algorithms.cpp @@ -0,0 +1,3168 @@ +// This is a gtest application that contains all of the C API tests. Parameters: +// nvgraph_capi_tests [--perf] [--stress-iters N] [--gtest_filter=NameFilterPattern] +// It also accepts any other gtest (1.7.0) default parameters. +// Right now this application contains: +// 1) Sanity Check tests - tests on simple examples with a known answer (or known behaviour) +// 2) Correctness check tests - tests on real graph data that use a reference algorithm +// (CPU code for SrSPMV and python scripts for other algorithms, see +// python scripts here: //sw/gpgpu/nvgraph/test/ref/) with reference results, and compare the two. +// They also measure the performance of a single algorithm C API call, if enabled (see below) +// 3) Corner case tests - tests with some bad inputs and bad parameters, expecting the library to handle +// them gracefully +// 4) Stress tests - make sure that the library result stays consistent throughout library usage +// (a lot of C API calls). They also make some assumptions about, and checks on, memory usage during +// this test. +// +// We can control which tests to launch by using gtest filters. For example: +// Only sanity tests: +// ./nvgraph_capi_tests --gtest_filter=*Sanity* +// And, correspondingly: +// ./nvgraph_capi_tests --gtest_filter=*Correctness* +// ./nvgraph_capi_tests --gtest_filter=*Corner* +// ./nvgraph_capi_tests --gtest_filter=*Stress* +// Or, a combination: +// ./nvgraph_capi_tests --gtest_filter=*Sanity*:*Correctness* +// +// Performance reports are provided in the ERIS format and are disabled by default. +// They can be enabled by adding '--perf' to the command line.
I added this parameter to vlct +// +// Parameter '--stress-iters N', which gives multiplier (not an absolute value) for the number of launches for stress tests +// + +#include + +#include "gtest/gtest.h" + +#include "nvgraph_test_common.h" + +#include "valued_csr_graph.hxx" +#include "readMatrix.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include // experimental header, contains hidden API entries, can be shared only under special circumstances without reveling internal things + +#include "stdlib.h" +#include + +// do the perf measurements, enabled by command line parameter '--perf' +static int PERF = 0; + +// minimum vertices in the graph to perform perf measurements +#define PERF_ROWS_LIMIT 10000 + +// number of repeats = multiplier/num_vertices +#define SRSPMV_ITER_MULTIPLIER 1000000000 +#define SSSP_ITER_MULTIPLIER 30000000 +#define WIDEST_ITER_MULTIPLIER 30000000 +#define PAGERANK_ITER_MULTIPLIER 300000000 + +static std::string ref_data_prefix = ""; +static std::string graph_data_prefix = ""; + +// iterations for stress tests = this multiplier * iterations for perf tests +static int STRESS_MULTIPLIER = 1; +static int simple_repeats = 50; +static int complex_repeats = 20; +static int print_test_timings = 1; + +// utility + +template +struct nvgraph_Const; + +template <> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_64F; + static const double inf; + static const double tol; + typedef union fpint + { + double f; + unsigned long u; + } fpint_st; +}; + +const double nvgraph_Const::inf = DBL_MAX; +const double nvgraph_Const::tol = 1e-6; // this is what we use as a tolerance in the algorithms, more precision than this is useless for CPU reference comparison + +template <> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_32F; + static const float inf; + static const float tol; + + typedef union fpint + { + float f; + unsigned u; + } fpint_st; + +}; + +const float nvgraph_Const::inf = FLT_MAX; +const float nvgraph_Const::tol = 1e-4; + +template +struct comparison +{ + bool operator() (T* lhs, T* rhs) {return (*lhs) < (*rhs);} +}; + +struct SR_OP +{ + + const char* + get_name(nvgraphSemiring_t sr) + { + const char* ret = "Unknown_SR"; + switch (sr) + { + case NVGRAPH_PLUS_TIMES_SR: + ret = "PLUS_TIMES_SR"; + break; + case NVGRAPH_MIN_PLUS_SR: + ret = "MIN_PLUS_SR"; + break; + case NVGRAPH_MAX_MIN_SR: + ret = "MAX_MIN_SR"; + break; + case NVGRAPH_OR_AND_SR: + ret = "OR_AND_SR"; + break; + } + return ret; + }; + + + template + T plus(const T& a, const T& b, nvgraphSemiring_t sr) + { + T ret = (T)0; + switch (sr) + { + case NVGRAPH_PLUS_TIMES_SR: + ret = a + b; + break; + case NVGRAPH_MIN_PLUS_SR: + ret = std::min(a, b); + break; + case NVGRAPH_MAX_MIN_SR: + ret = std::max(a, b); + break; + case NVGRAPH_OR_AND_SR: + ret = (T)((bool)(a) | (bool)(b)); + break; + default: + printf("Semiring %d is not supported, check line %d\n", (int)sr, __LINE__); + //FAIL() << "Semiring #" << (int)sr << " is not supported."; + } + return ret; + }; + + template + T mul(const T& a, const T& b, nvgraphSemiring_t sr) + { + T ret = (T)0; + switch (sr) + { + case NVGRAPH_PLUS_TIMES_SR: + ret = a * b; + break; + case NVGRAPH_MIN_PLUS_SR: + ret = a + b; + break; + case NVGRAPH_MAX_MIN_SR: + ret = std::min(a, b);; + break; + case NVGRAPH_OR_AND_SR: + ret = (T)((bool)(a) & (bool)(b)); + break; + default: + printf("Semiring %d is not supported, check line %d\n", (int)sr, __LINE__); + //FAIL() << "Semiring #" << (int)sr << " is not supported."; + } + return ret; + }; + + 
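// --- Editor's note (added documentation, not part of the original diff) ---
// SR_OPS supplies the host-side reference semantics of each nvGRAPH semiring.
// The SrSPMV correctness check further below uses it per row: it folds
// plus( mul( mul(alpha, w), x[col] ) ) over the row's nonzeros and then adds
// mul(y[row], beta). Summary of the operators and of the identity returned by
// get_ini() right below:
//   NVGRAPH_PLUS_TIMES_SR : plus = a+b,      mul = a*b,      identity = 0     (ordinary SpMV)
//   NVGRAPH_MIN_PLUS_SR   : plus = min(a,b), mul = a+b,      identity = +inf  (tropical / shortest-path style)
//   NVGRAPH_MAX_MIN_SR    : plus = max(a,b), mul = min(a,b), identity = -inf  (bottleneck / widest-path style)
//   NVGRAPH_OR_AND_SR     : plus = a|b,      mul = a&b,      identity = 0     (Boolean reachability)
// --- End editor's note ---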
template + T get_ini(const nvgraphSemiring_t& sr) + { + T ret = (T)0; + switch (sr) + { + case NVGRAPH_PLUS_TIMES_SR: + ret = (T)0; + break; + case NVGRAPH_MIN_PLUS_SR: + ret = nvgraph_Const::inf; + break; + case NVGRAPH_MAX_MIN_SR: + ret = -(nvgraph_Const::inf); + break; + case NVGRAPH_OR_AND_SR: + ret = (T)0; + break; + default: + printf("Semiring %d is not supported, check line %d\n", (int)sr, __LINE__); + //FAIL() << "Semiring #" << (int)sr << " is not supported."; + } + return ret; + }; + +} SR_OPS; + + +template +bool enough_device_memory(int n, int nnz, size_t add) +{ + size_t mtotal, mfree; + cudaMemGetInfo(&mfree, &mtotal); + if (mfree > add + sizeof(T)*3*(n + nnz)) + return true; + return false; +} + +std::string convert_to_local_path(const std::string& in_file) +{ + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (graph_data_prefix.length() > 0) + { + prefix = graph_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\matrices_collection\\"; + prefix = "Z:\\matrices_collection\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); +#else + prefix = "/mnt/nvgraph_test_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + +std::string convert_to_local_path_refdata(const std::string& in_file) +{ + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (ref_data_prefix.length() > 0) + { + prefix = ref_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\ref_data\\"; + prefix = "Z:\\ref_data\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); +#else + prefix = "/mnt/nvgraph_test_data/ref_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + +// SrSPMV tests + +typedef struct SrSPMV_Usecase_t +{ + std::string graph_file; + nvgraphSemiring_t sr; + double alpha; + double beta; + double tolerance_mul; + SrSPMV_Usecase_t(const std::string& a, nvgraphSemiring_t b, const double c, const double d, double tolerance_multiplier = 1.0) : sr(b), alpha(c), beta(d), tolerance_mul(tolerance_multiplier) { graph_file = convert_to_local_path(a);}; + SrSPMV_Usecase_t& operator=(const SrSPMV_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + sr = rhs.sr; + alpha = rhs.alpha; + beta = rhs.beta; + return *this; + }; +} SrSPMV_Usecase; + +typedef struct SSSP_Usecase_t +{ + std::string graph_file; + int source_vert; + std::string result_file; + double tolerance_mul; + SSSP_Usecase_t(const std::string& a, int b, const std::string& c, double tolerance_multiplier = 1.0) : source_vert(b), tolerance_mul(tolerance_multiplier) { graph_file = convert_to_local_path(a); result_file = convert_to_local_path_refdata(c);}; + SSSP_Usecase_t& operator=(const SSSP_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + result_file = rhs.result_file; + return *this; + } +} SSSP_Usecase; + +typedef struct WidestPath_Usecase_t +{ + std::string graph_file; + int source_vert; + std::string result_file; + double tolerance_mul; + WidestPath_Usecase_t(const std::string& a, int b, const std::string& c, double tolerance_multiplier = 1.0) : source_vert(b), tolerance_mul(tolerance_multiplier) { graph_file = convert_to_local_path(a); result_file = convert_to_local_path_refdata(c);}; + WidestPath_Usecase_t& operator=(const WidestPath_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + result_file = rhs.result_file; + return *this; + } +} WidestPath_Usecase; + +typedef struct Pagerank_Usecase_t +{ 
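// Editor's note (added documentation, not part of the original diff) - fields of one PageRank test case:
//   graph_file    : transposed network in AMGX CSR binary format with the dangling-node bookmark
//                   (path resolved through convert_to_local_path)
//   alpha         : PageRank damping factor
//   result_file   : reference ranks to compare against (resolved through convert_to_local_path_refdata)
//   tolerance_mul : per-case multiplier, presumably applied to the floating-point comparison tolerance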
+ std::string graph_file; + float alpha; + std::string result_file; + double tolerance_mul; + Pagerank_Usecase_t(const std::string& a, float b, const std::string& c, double tolerance_multiplier = 1.0) : alpha(b), tolerance_mul(tolerance_multiplier) { graph_file = convert_to_local_path(a); result_file = convert_to_local_path_refdata(c);}; + Pagerank_Usecase_t& operator=(const Pagerank_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + alpha = rhs.alpha; + result_file = rhs.result_file; + return *this; + } +} Pagerank_Usecase; + + +class NVGraphCAPITests_SrSPMV : public ::testing::TestWithParam { + public: + NVGraphCAPITests_SrSPMV() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + //const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + //printf("We are in test %s of test case %s.\n", test_info->name(), test_info->test_case_name()); + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const SrSPMV_Usecase& param) + { + double test_start, test_end, read_start, read_end; + test_start = second(); + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << "_alpha_" << (int)param.alpha << "_beta_" << (int)param.beta; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + ss.str(); + + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + int weight_index = 0; + int x_index = 0; + int y_index = 1; + nvgraphStatus_t status; + + read_start = second(); + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); + fclose(fpin); + read_end = second(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size())) || + (PERF && (n < PERF_ROWS_LIMIT || param.alpha + param.beta < 2))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + //@TODO: random fill? 
+ std::vector calculated_res(n); + std::vector data1(n), data2(n); + for (int i = 0; i < n; i++) + { + data1[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + data2[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + //printf ("data1[%d]==%f, data2[%d]==%f\n", i, data1[i], i, data2[i]); + } + void* vertexptr[2] = {(void*)&data1[0], (void*)&data2[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[0], x_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], y_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], weight_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + T alphaT = (T)param.alpha; + T betaT = (T)param.beta; + + // run + if (PERF) + { + double start, stop; + // warmup + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alphaT, x_index, (void*)&betaT, y_index, param.sr); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDeviceSynchronize(); + + int repeat = simple_repeats; + start = second(); + start = second(); + // perf loop + for (int i = 0; i < repeat; i++) + { + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alphaT, x_index, (void*)&betaT, y_index, param.sr); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s_%s %10.8f -ms\n", test_id.c_str(), SR_OPS.get_name(param.sr), 1000.0*(stop-start)/((double)repeat)); + } + + // reinit data + status = nvgraphSetVertexData(handle, g1, (void*)&data2[0], y_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alphaT, x_index, (void*)&betaT, y_index, param.sr); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // get result + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], y_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // check correctness + std::vector expected_res(n, SR_OPS.get_ini(param.sr)); + for (int row = 0; row < n; row++) + { + for (int nz = read_row_ptr[row]; nz < read_row_ptr[row+1]; nz++) + { + expected_res[row] = SR_OPS.plus(expected_res[row], SR_OPS.mul(SR_OPS.mul(param.alpha, read_val[nz], param.sr), data1[read_col_ind[nz]], param.sr), param.sr); + } + expected_res[row] = SR_OPS.plus(expected_res[row], SR_OPS.mul(data2[row], param.beta, param.sr), param.sr); + double reference_res = (double)expected_res[row]; + double nvgraph_res = (double)calculated_res[row]; + ASSERT_NEAR(reference_res, nvgraph_res, nvgraph_Const::tol) << "In row #" << row << " graph " << param.graph_file << " semiring " << SR_OPS.get_name(param.sr) << " alpha=" << param.alpha << " beta=" << param.beta << "\n"; + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + test_end = second(); + if (print_test_timings) printf("Test took: %f seconds from which %f seconds were spent on data reading\n", test_end - test_start, read_end - read_start); + } +}; + +TEST_P(NVGraphCAPITests_SrSPMV, CheckResultDouble) +{ + run_current_test(GetParam()); + +} + +TEST_P(NVGraphCAPITests_SrSPMV, CheckResultFloat) +{ + run_current_test(GetParam()); +} + + +/// 
WidestPath tests + +class NVGraphCAPITests_WidestPath : public ::testing::TestWithParam { + public: + NVGraphCAPITests_WidestPath() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const WidestPath_Usecase& param) + { + double test_start, test_end, read_start, read_end; + test_start = second(); + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.source_vert; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + std::string("_") + ss.str().c_str(); + + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + nvgraphStatus_t status; + + read_start = second(); + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); + fclose(fpin); + read_end = second(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size())) || + (PERF && n < PERF_ROWS_LIMIT)) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." 
<< test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + size_t numsets = 1; + std::vector calculated_res(n); + //void* vertexptr[1] = {(void*)&calculated_res[0]}; + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, numsets, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + //status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0 ); + //ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, numsets, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int weight_index = 0; + int source_vert = param.source_vert; + int widest_path_index = 0; + + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + cudaDeviceSynchronize(); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + // run + if (PERF) + { + double start, stop; + start = second(); + start = second(); + int repeat = simple_repeats; + for (int i = 0; i < repeat; i++) + { + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", test_id.c_str(), 1000.0*(stop-start)/repeat); + } + + // get result + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // check correctness + if (param.result_file.length()>0) + { + fpin = fopen(param.result_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file << std::endl; + std::vector expected_res(n); + ASSERT_EQ(read_binary_vector(fpin, n, expected_res), 0); + fclose(fpin); + for (int i = 0; i < n; i++) + { + ASSERT_NEAR(expected_res[i], calculated_res[i], nvgraph_Const::tol) << "In row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n" ; + } + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + test_end = second(); + if (print_test_timings) printf("Test took: %f seconds from which %f seconds were spent on data reading\n", test_end - test_start, read_end - read_start); + } +}; + +TEST_P(NVGraphCAPITests_WidestPath, CheckResultDouble) +{ + run_current_test(GetParam()); + +} + +TEST_P(NVGraphCAPITests_WidestPath, CheckResultFloat) +{ + run_current_test(GetParam()); +} + + + +//// SSSP tests + +class NVGraphCAPITests_SSSP : public ::testing::TestWithParam { + public: + NVGraphCAPITests_SSSP() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const SSSP_Usecase& param) + 
{ + double test_start, test_end, read_start, read_end; + test_start = second(); + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.source_vert; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + std::string("_") + ss.str().c_str(); + + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + nvgraphStatus_t status; + + read_start = second(); + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); + fclose(fpin); + read_end = second(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size())) || + (PERF && n < PERF_ROWS_LIMIT)) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + size_t numsets = 1; + std::vector calculated_res(n); + //void* vertexptr[1] = {(void*)&calculated_res[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, numsets, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + //status = nvgraphSetVertexData(handle, descrG, vertexptr[0], 0 ); + //ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, numsets, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + int weight_index = 0; + int source_vert = param.source_vert; + int sssp_index = 0; + + // run + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + cudaDeviceSynchronize(); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + if (PERF) + { + double start, stop; + start = second(); + start = second(); + int repeat = simple_repeats; + for (int i = 0; i < repeat; i++) + { + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", test_id.c_str(), 1000.0*(stop-start)/repeat); + } + + // get result + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // check with reference + if (param.result_file.length() > 0) + { + fpin = fopen(param.result_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file << std::endl; + std::vector expected_res(n); + ASSERT_EQ(read_binary_vector(fpin, n, expected_res), 0); + fclose(fpin); + for (int i = 0; i < n; i++) + { + 
ASSERT_NEAR(expected_res[i], calculated_res[i], nvgraph_Const::tol) << "In row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n" ; + } + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + test_end = second(); + if (print_test_timings) printf("Test took: %f seconds from which %f seconds were spent on data reading\n", test_end - test_start, read_end - read_start); + } +}; + +TEST_P(NVGraphCAPITests_SSSP, CheckResultDouble) +{ + run_current_test(GetParam()); +} + +TEST_P(NVGraphCAPITests_SSSP, CheckResultFloat) +{ + run_current_test(GetParam()); +} + +class NVGraphCAPITests_Pagerank : public ::testing::TestWithParam { + public: + NVGraphCAPITests_Pagerank() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const Pagerank_Usecase& param) + { + double test_start, test_end, read_start, read_end; + test_start = second(); + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.alpha; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + std::string("_") + ss.str().c_str(); + + if (param.graph_file == "dummy") + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + // Waive hugebubbles test, http://nvbugs/200189611 + /*{ + cudaDeviceProp prop; + cudaGetDeviceProperties ( &prop, 0 ); + std::string gpu(prop.name); + if (param.graph_file.find("hugebubbles-00020") != std::string::npos && + (gpu.find("M40") != npos || + gpu.find("GTX 980 Ti") != npos || + gpu.find("GTX TITAN X") != npos || + gpu.find("M6000") != npos || + gpu.find("GTX 680") != npos) + ) + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + }*/ + + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + nvgraphStatus_t status; + + read_start = second(); + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + std::vector dangling(n); + ASSERT_EQ(read_data_amgx_csr_bin_rhs (fpin, n, nnz, read_row_ptr, read_col_ind, read_val, dangling), 0); + fclose(fpin); + read_end = second(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size())) || + (PERF && n < PERF_ROWS_LIMIT)) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." 
<< test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + std::vector calculated_res(n, (T)1.0/n); + void* vertexptr[2] = {(void*)&dangling[0], (void*)&calculated_res[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, 2, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int bookmark_index = 0; + int weight_index = 0; + T alpha = param.alpha; + int pagerank_index = 1; + int has_guess = 0; + float tolerance = (sizeof(T) > 4 ? 1e-8f : 1e-6f) * param.tolerance_mul; + int max_iter = 1000; + + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + cudaDeviceSynchronize(); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // run + if (PERF) + { + double start, stop; + start = second(); + start = second(); + int repeat = complex_repeats; + for (int i = 0; i < repeat; i++) + { + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", test_id.c_str(), 1000.0*(stop-start)/repeat); + } + + // get result + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], pagerank_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + std::sort(calculated_res.begin(), calculated_res.end()); + + // check with reference + if (param.result_file.length()>0) + { + fpin = fopen(param.result_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file << std::endl; + std::vector expected_res(n); + ASSERT_EQ(read_binary_vector(fpin, n, expected_res), 0); + fclose(fpin); + T tot_err = 0.0, err; + int n_err = 0; + for (int i = 0; i < n; i++) + { + err = fabs(expected_res[i] - calculated_res[i]); + if (err> nvgraph_Const::tol) + { + tot_err+=err; + n_err++; + } + } + if (n_err) + { + EXPECT_NEAR(tot_err/n_err, nvgraph_Const::tol, nvgraph_Const::tol*9.99); // Network x used n*1e-10 for precision + ASSERT_LE(n_err, 0.001*n); // we tolerate 0.1% of values with a litte difference + //printf("number of incorrect entries: %d\n", n_err); + } + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + test_end = second(); + if (print_test_timings) printf("Test took: %f seconds from which %f seconds were spent on data reading\n", test_end - test_start, read_end - read_start); + } +}; + +TEST_P(NVGraphCAPITests_Pagerank, CheckResultDouble) +{ + run_current_test(GetParam()); +} + +TEST_P(NVGraphCAPITests_Pagerank, 
CheckResultFloat) +{ + run_current_test(GetParam()); +} + +class NVGraphCAPITests_KrylovPagerank : public ::testing::TestWithParam { + public: + NVGraphCAPITests_KrylovPagerank() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const Pagerank_Usecase& param) + { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.alpha; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + std::string("_") + ss.str().c_str(); + + if (param.graph_file == "dummy") + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + std::vector dangling(n); + ASSERT_EQ(read_data_amgx_csr_bin_rhs (fpin, n, nnz, read_row_ptr, read_col_ind, read_val, dangling), 0); + fclose(fpin); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + std::vector calculated_res(n, (T)1.0/n); + void* vertexptr[2] = {(void*)&dangling[0], (void*)&calculated_res[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, 2, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int bookmark_index = 0; + int weight_index = 0; + T alpha = param.alpha; + int pagerank_index = 1; + int has_guess = 0; + float tolerance = (sizeof(T) > 4 ? 
1e-8f : 1e-6f) * param.tolerance_mul;
+        int max_iter = 150;
+        int ss_sz = 7;
+
+        // run
+        if (PERF && n > PERF_ROWS_LIMIT)
+        {
+            double start, stop;
+            start = second();
+            start = second();
+            int repeat = 10;
+            for (int i = 0; i < repeat; i++)
+                status = nvgraphKrylovPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, tolerance, max_iter, ss_sz, has_guess, pagerank_index);
+            stop = second();
+            printf("&&&& PERF Time_%s %10.8f -ms\n", test_id.c_str(), 1000.0*(stop-start)/repeat);
+        }
+        else
+            status = nvgraphKrylovPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, tolerance, max_iter, ss_sz, has_guess, pagerank_index);
+        ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status);
+
+        // get result
+        status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], pagerank_index);
+        ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status);
+
+        std::sort(calculated_res.begin(), calculated_res.end());
+
+        // check with reference
+        if (param.result_file.length() > 0)
+        {
+            fpin = fopen(param.result_file.c_str(), "rb");
+            ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file << std::endl;
+            std::vector<T> expected_res(n);
+            ASSERT_EQ(read_binary_vector(fpin, n, expected_res), 0);
+            fclose(fpin);
+            T tot_err = 0.0, err;
+            int n_err = 0;
+            for (int i = 0; i < n; i++)
+            {
+                err = fabs(expected_res[i] - calculated_res[i]);
+                if (err > nvgraph_Const<T>::tol)
+                {
+                    tot_err += err;
+                    n_err++;
+                }
+            }
+            if (n_err)
+            {
+                EXPECT_NEAR(tot_err/n_err, nvgraph_Const<T>::tol, nvgraph_Const<T>::tol*9.99); // NetworkX used n*1e-10 for precision
+                ASSERT_LE(n_err, 0.001*n); // we tolerate 0.1% of values with a little difference
+                //printf("number of incorrect entries: %d\n", n_err);
+            }
+        }
+
+        status = nvgraphDestroyGraphDescr(handle, g1);
+        ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status);
+    }
+
+};
+
+TEST_P(NVGraphCAPITests_KrylovPagerank, CheckResultDouble)
+{
+    run_current_test<double>(GetParam());
+}
+
+TEST_P(NVGraphCAPITests_KrylovPagerank, CheckResultFloat)
+{
+    run_current_test<float>(GetParam());
+}
+
+/// A few sanity checks.
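+
+// The sanity and corner-case tests below all follow the same nvGraph C API
+// sequence: create a graph descriptor, attach a CSR/CSC topology, allocate
+// vertex/edge data sets, set their values, run an algorithm, read the result
+// back with nvgraphGetVertexData, and destroy the descriptor. The helper
+// below is a minimal illustrative sketch of that sequence using SSSP on a
+// float-weighted CSC graph; it is not called by any test, and its name and
+// signature are ours rather than part of the original suite.
+static nvgraphStatus_t sketch_run_sssp(nvgraphHandle_t handle,
+                                       const nvgraphCSCTopology32I_st& topo,
+                                       const std::vector<float>& weights,
+                                       int source_vert,
+                                       std::vector<float>& result)
+{
+    nvgraphGraphDescr_t g = NULL;
+    nvgraphStatus_t st = nvgraphCreateGraphDescr(handle, &g);
+    if (st != NVGRAPH_STATUS_SUCCESS) return st;
+
+    // attach the CSC structure (host-side offsets/indices inside 'topo')
+    st = nvgraphSetGraphStructure(handle, g, (void*)&topo, NVGRAPH_CSC_32);
+
+    // one vertex data set for the SSSP output, one edge data set for the weights
+    cudaDataType_t type_v[1] = {CUDA_R_32F};
+    cudaDataType_t type_e[1] = {CUDA_R_32F};
+    if (st == NVGRAPH_STATUS_SUCCESS) st = nvgraphAllocateVertexData(handle, g, 1, type_v);
+    if (st == NVGRAPH_STATUS_SUCCESS) st = nvgraphAllocateEdgeData(handle, g, 1, type_e);
+    if (st == NVGRAPH_STATUS_SUCCESS) st = nvgraphSetEdgeData(handle, g, (void*)&weights[0], 0);
+
+    // run SSSP from 'source_vert' and copy the per-vertex distances back to the host
+    if (st == NVGRAPH_STATUS_SUCCESS) st = nvgraphSssp(handle, g, 0 /*weight_index*/, &source_vert, 0 /*sssp_index*/);
+    if (st == NVGRAPH_STATUS_SUCCESS) st = nvgraphGetVertexData(handle, g, (void*)&result[0], 0);
+
+    nvgraphDestroyGraphDescr(handle, g);
+    return st;
+}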
+ +class NVGraphCAPITests_SrSPMV_Sanity : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_SrSPMV_Sanity() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSR_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void prepare_and_run(const nvgraphCSRTopology32I_st& topo_st, T* edgedata, T* data1, T* data2, T alpha, T beta, T* expected ) + { + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topo_st.nvertices; + nnz = topo_st.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topo_st, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + void* vertexptr[2] = {(void*)data1, (void*)data2}; + void* edgeptr[1] = {(void*)edgedata}; + int weight_index = 0; + int x_index = 0; + int y_index = 1; + + status = nvgraphSetVertexData(handle, g1, vertexptr[0], x_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], y_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, edgeptr[0], weight_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // get result + std::vector calculated_res(n); + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], y_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + double reference_res = (double)expected[row]; + double nvgraph_res = (double)calculated_res[row]; + ASSERT_NEAR(reference_res, nvgraph_res, nvgraph_Const::tol) << "row=" << row << " alpha=" << alpha << " beta=" << beta << "\n"; + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // Trivial matrix with trivial answers, checks plus_times sr only (but that is good enough) and some set of alfa and beta + template + void run_simple_test() + { + n = 1024; + nnz = 1024; + std::vector offsets(n+1), neighborhood(nnz); + std::vector data1(n), data2(n); + for (int i = 0; i < n; i++) + { + data1[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + data2[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + offsets[i] = neighborhood[i] = i; + } + offsets[n] = n; + std::vector edge_data(nnz, (T)(-2.0)); + std::vector expected_res(n, SR_OPS.get_ini(NVGRAPH_PLUS_TIMES_SR)); + + nvgraphCSRTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + T pa[] = {-1.0, 0.0, 0.5, 1.0}; + T pb[] = {-1.0, 0.0, 0.5, 1.0}; + for (int ia = 0; ia < sizeof(pa)/sizeof(T); ia++) + for (int ib = 0; ib < sizeof(pb)/sizeof(T); ib++) + { + for (int i = 0; i < n; i++) + { + expected_res[i] = 
SR_OPS.get_ini(NVGRAPH_PLUS_TIMES_SR); + } + for (int i = 0; i < n; i++) + { + T tv1 = SR_OPS.mul(data1[i], edge_data[i], NVGRAPH_PLUS_TIMES_SR); + tv1 = SR_OPS.mul(tv1, pa[ia], NVGRAPH_PLUS_TIMES_SR); + T tv2 = SR_OPS.mul(data2[i], pb[ib], NVGRAPH_PLUS_TIMES_SR); + tv2 = SR_OPS.plus(tv1, tv2, NVGRAPH_PLUS_TIMES_SR); + expected_res[i] = SR_OPS.plus(expected_res[i], tv2, NVGRAPH_PLUS_TIMES_SR); + } + prepare_and_run(topology, &edge_data[0], &data1[0], &data2[0], pa[ia], pb[ib], &expected_res[0]); + } + } +}; + +TEST_F(NVGraphCAPITests_SrSPMV_Sanity, SanityDouble) +{ + run_simple_test(); + +} + +TEST_F(NVGraphCAPITests_SrSPMV_Sanity, SanityFloat) +{ + run_simple_test(); +} + +class NVGraphCAPITests_SSSP_Sanity : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_SSSP_Sanity() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSC_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void prepare_and_run(const nvgraphCSCTopology32I_st& topo_st, T* edgedata, T* expected ) + { + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topo_st.nvertices; + nnz = topo_st.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topo_st, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + void* edgeptr[1] = {(void*)edgedata}; + status = nvgraphSetEdgeData(handle, g1, edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int source_vert = 0; + int sssp_index = 0; + int weight_index = 0; + + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status) << ", n=" << n << std::endl; + + // get result + std::vector calculated_res(n); + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + double reference_res = (double)expected[row]; + double nvgraph_res = (double)calculated_res[row]; + ASSERT_NEAR(reference_res, nvgraph_res, nvgraph_Const::tol) << "row=" << row << ", n=" << n << std::endl; + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + +// cycle graph, all weights = 1, shortest path = vertex number + template + void run_cycle_test() + { + n = 1050; + nnz = n; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i] = i; + neighborhood[i] = (n - 1 + i) % n; + } + offsets[n] = n; + std::vector edge_data(nnz, (T)1.0); + std::vector expected_res(n, nvgraph_Const::inf); + for (int i = 0; i < n; i++) + { + expected_res[i] = i; + } + + // extensive run for small N's + for (int i = 3; i < 200; i++) + { + neighborhood[0] = i - 1; + nvgraphCSCTopology32I_st topology = {i, i, 
&offsets[0], &neighborhood[0]}; + prepare_and_run(topology, &edge_data[0], &expected_res[0]); + } + // also trying larger N's + for (int i = 1020; i < 1030; i++) + { + neighborhood[0] = i - 1; + nvgraphCSCTopology32I_st topology = {i, i, &offsets[0], &neighborhood[0]}; + prepare_and_run(topology, &edge_data[0], &expected_res[0]); + } + } + +// full binary tree, all weights = 1, shortest path length = level of the node + template + void run_tree_test() + { + int k = 3; + n = (1 << k) - 1; + nnz = (1 << k) - 2; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i+1] = i; + } + offsets[0] = 0; + for (int i = 0; i < nnz; i++) + { + neighborhood[i] = i / 2; + } + std::vector edge_data(nnz, (T)1.0); + std::vector expected_res(n, nvgraph_Const::inf); + expected_res[0] = 0; + for (int i = 1; i < k; i++) + { + for (int v = 0; v < (1 << i); v++) + expected_res[(1 << i) - 1 + v] = i; + } + + nvgraphCSCTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + prepare_and_run(topology, &edge_data[0], &expected_res[0]); + } +}; + +TEST_F(NVGraphCAPITests_SSSP_Sanity, SanityCycleDouble) +{ + run_cycle_test(); +} + +TEST_F(NVGraphCAPITests_SSSP_Sanity, SanityCycleFloat) +{ + run_cycle_test(); +} + +TEST_F(NVGraphCAPITests_SSSP_Sanity, SanityTreeDouble) +{ + run_tree_test(); +} + +TEST_F(NVGraphCAPITests_SSSP_Sanity, SanityTreeFloat) +{ + run_tree_test(); +} + + +class NVGraphCAPITests_WidestPath_Sanity : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_WidestPath_Sanity() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSC_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void prepare_and_run(const nvgraphCSCTopology32I_st& topo_st, T* edgedata, T* expected ) + { + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topo_st.nvertices; + nnz = topo_st.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topo_st, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + void* edgeptr[1] = {(void*)edgedata}; + status = nvgraphSetEdgeData(handle, g1, edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int source_vert = 0; + int widest_path_index = 0; + int weight_index = 0; + + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // get result + std::vector calculated_res(n); + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + double reference_res = (double)expected[row]; + double nvgraph_res = (double)calculated_res[row]; + ASSERT_NEAR(reference_res, nvgraph_res, 
nvgraph_Const::tol); + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + +// cycle graph, weigths are from n-1 to 0 starting with vertex = 0. widest path = [inf, n-1, n-2, ..., 1] + template + void run_cycle_test() + { + n = 1024; + nnz = n; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i] = i; + neighborhood[i] = (n - 1 + i) % n; + } + offsets[n] = n; + std::vector edge_data(nnz, 0); + std::vector expected_res(n, nvgraph_Const::inf); + for (int i = 1; i < n; i++) + { + edge_data[i] = (T)(n - i); + } + for (int i = 1; i < n; i++) + { + expected_res[i] = (T)(n - i); + } + + nvgraphCSCTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + prepare_and_run(topology, &edge_data[0], &expected_res[0]); + } + +// cycle graph, edge weigths are equal to the (max_tree_lvl - edge_lvl). widest path to vertex is = (max_lvl - vertex_lvl) + template + void run_tree_test() + { + int k = 10; + n = (1 << k) - 1; + nnz = (1 << k) - 2; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i+1] = i; + } + offsets[0] = 0; + for (int i = 0; i < nnz; i++) + { + neighborhood[i] = i / 2; + } + // fill edge data and expected res accordingly + std::vector edge_data(nnz); + std::vector expected_res(n, nvgraph_Const::inf); + for (int i = 1; i < k; i++) + { + for (int v = 0; v < (1 << i); v++) + { + edge_data[(1 << i) - 2 + v] = (k - i); + expected_res[(1 << i) - 1 + v] = (k - i); + } + } + + nvgraphCSCTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + prepare_and_run(topology, &edge_data[0], &expected_res[0]); + } +}; + +TEST_F(NVGraphCAPITests_WidestPath_Sanity, SanityCycleDouble) +{ + run_cycle_test(); +} + +TEST_F(NVGraphCAPITests_WidestPath_Sanity, SanityCycleFloat) +{ + run_cycle_test(); +} + +TEST_F(NVGraphCAPITests_WidestPath_Sanity, SanityTreeDouble) +{ + run_tree_test(); +} + +TEST_F(NVGraphCAPITests_WidestPath_Sanity, SanityTreeFloat) +{ + run_tree_test(); +} + + +class NVGraphCAPITests_Pagerank_Sanity : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_Pagerank_Sanity() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSC_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void prepare_and_run(const nvgraphCSCTopology32I_st& topo_st, T* bookmark, T* edge_data ) + { + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topo_st.nvertices; + nnz = topo_st.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topo_st, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int bookmark_index = 0; + int weight_index = 0; + T alpha = 0.85; + int pagerank_index = 1; 
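+        // Data layout used by the PageRank tests: vertex data set 0 holds the
+        // dangling-node bookmark, vertex data set 1 holds the PageRank vector,
+        // and edge data set 0 holds the edge weights.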
+ int has_guess = 0; + float tolerance = 1e-6;//sizeof(T) > 4 ? 1e-8f : 1e-6f; + int max_iter = 1000; + + status = nvgraphSetVertexData(handle, g1, (void*)bookmark, bookmark_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + void* edgeptr[1] = {(void*)edge_data}; + status = nvgraphSetEdgeData(handle, g1, edgeptr[0], weight_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // run + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // get result + std::vector calculated_res(n); + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], pagerank_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 1; row < n; row++) + { + //printf("PR[%d] == %10.7g, PR[%d] == %10.7g\n", row-1, calculated_res[row-1], row, calculated_res[row]); + double res1 = (double)calculated_res[row-1]; + double res2 = (double)calculated_res[row]; + ASSERT_LE(res1, res2) << "In row: " << row << "\n"; + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + +// path graph, weigths are = 1, last node is dangling, pagerank should be in ascending order + template + void run_path_test() + { + n = 1024; + nnz = n - 1; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[1+i] = i; + neighborhood[i] = i; + } + offsets[0] = 0; + std::vector edge_data(nnz, 1); + std::vector dangling(n, 0); + dangling[n-1] = (T)(1); + + nvgraphCSCTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + prepare_and_run(topology, &dangling[0], &edge_data[0]); + } +}; + +TEST_F(NVGraphCAPITests_Pagerank_Sanity, SanityPathDouble) +{ + run_path_test(); +} + +TEST_F(NVGraphCAPITests_Pagerank_Sanity, SanitypathFloat) +{ + run_path_test(); +} + + + +/// Corner cases for the C API + +class NVGraphCAPITests_SrSPMV_CornerCases : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_SrSPMV_CornerCases() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSR_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + // Trivial matrix with trivial answers, checks plus_times sr only (but that is good enough) and sets of alfa and beta from {0.0, 1.0} + template + void run_simple_test() + { + n = 1024; + nnz = 1024; + std::vector offsets(n+1), neighborhood(nnz); + std::vector data1(n), data2(n); + for (int i = 0; i < n; i++) + { + data1[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + data2[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + offsets[i] = neighborhood[i] = i; + } + offsets[n] = n; + std::vector edge_data(nnz, (T)1.0); + + nvgraphCSRTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + T alpha = (T)(1.0); + T beta = (T)(1.0); + int weight_index = 0; + int x_index = 0; + int y_index = 1; + + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topology.nvertices; + nnz = topology.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + 
ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + // not multivalued CSR + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + void* vertexptr[2] = {(void*)&data1[0], (void*)&data2[0]}; + void* edgeptr[1] = {(void*)(&edge_data[0])}; + status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], 1 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // different bad values + status = nvgraphSrSpmv(NULL, g1, weight_index, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSrSpmv(handle, NULL, weight_index, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSrSpmv(handle, g1, 10, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alpha, 10, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alpha, x_index, (void*)&beta, 10, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // only CSR is supported + { + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSC_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // only 32F and 64F real are supported + // but we cannot check SrSPMV for that because AllocateData will throw an error first + /*for (int i = 0; i < 10; i++) + { + if (i == CUDA_R_32F || i == CUDA_R_64F) + continue; + cudaDataType_t t_type_v[2] = {(cudaDataType_t)i, (cudaDataType_t)i}; + cudaDataType_t t_type_e[1] = {(cudaDataType_t)i}; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 2, t_type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); 
+ status = nvgraphAllocateEdgeData(handle, g1, 1, t_type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alpha, x_index, (void*)&beta, y_index, NVGRAPH_PLUS_TIMES_SR); + ASSERT_EQ(NVGRAPH_STATUS_TYPE_NOT_SUPPORTED, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + */ + } +}; + +TEST_F(NVGraphCAPITests_SrSPMV_CornerCases, CornerCasesDouble) +{ + run_simple_test(); + +} + +TEST_F(NVGraphCAPITests_SrSPMV_CornerCases, CornerCasesFloat) +{ + run_simple_test(); +} + + +class NVGraphCAPITests_SSSP_CornerCases : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_SSSP_CornerCases() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSC_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void run_cycle_test() + { + n = 1024; + nnz = n; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i] = i; + neighborhood[i] = (n - 1 + i) % n; + } + offsets[n] = n; + std::vector edge_data(nnz, (T)1.0); + + nvgraphCSCTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + int source_vert = 0; + int sssp_index = 0; + int weight_index = 0; + + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // only multivaluedCSR are supported + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + void* edgeptr[1] = {(void*)&edge_data[0]}; + status = nvgraphSetEdgeData(handle, g1, edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + + status = nvgraphSssp(NULL, g1, weight_index, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSssp(handle, NULL, weight_index, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSssp(handle, g1, 500, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSssp(handle, g1, weight_index, NULL, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSssp(handle, g1, weight_index, &source_vert, 500); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // only CSC is supported + { + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = 
nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // only 32F and 64F real are supported + // but we cannot check SSSP for that because AllocateData will throw an error first + /*for (int i = 0; i < 10; i++) + { + if (i == CUDA_R_32F || i == CUDA_R_64F) + continue; + cudaDataType_t t_type_v[2] = {(cudaDataType_t)i, (cudaDataType_t)i}; + cudaDataType_t t_type_e[1] = {(cudaDataType_t)i}; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSC_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 1, t_type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, t_type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_TYPE_NOT_SUPPORTED, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + */ + } +}; + +TEST_F(NVGraphCAPITests_SSSP_CornerCases, CornerCasesDouble) +{ + run_cycle_test(); +} + +TEST_F(NVGraphCAPITests_SSSP_CornerCases, CornerCasesFloat) +{ + run_cycle_test(); +} + + + +class NVGraphCAPITests_WidestPath_CornerCases : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_WidestPath_CornerCases() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSC_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void run_test() + { + n = 1024; + nnz = n; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i] = i; + neighborhood[i] = (n - 1 + i) % n; + } + offsets[n] = n; + std::vector edge_data(nnz, (T)1.0); + std::vector expected_res(n, nvgraph_Const::inf); + for (int i = 0; i < n; i++) + { + expected_res[i] = i; + } + + nvgraphCSCTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + int source_vert = 0; + int widest_path_index = 0; + int weight_index = 0; + + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + + + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + 
ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + void* edgeptr[1] = {(void*)&edge_data[0]}; + status = nvgraphSetEdgeData(handle, g1, edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + + status = nvgraphWidestPath(NULL, g1, weight_index, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphWidestPath(handle, NULL, weight_index, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphWidestPath(handle, g1, 500, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphWidestPath(handle, g1, weight_index, NULL, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, 500); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // only CSC is supported + { + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // only 32F and 64F real are supported + // but we cannot check WidestPath for that because AllocateData will throw an error first + /*for (int i = 0; i < 10; i++) + { + if (i == CUDA_R_32F || i == CUDA_R_64F) + continue; + cudaDataType_t t_type_v[2] = {(cudaDataType_t)i, (cudaDataType_t)i}; + cudaDataType_t t_type_e[1] = {(cudaDataType_t)i}; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSC_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 1, t_type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, t_type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_TYPE_NOT_SUPPORTED, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + */ + } +}; + +TEST_F(NVGraphCAPITests_WidestPath_CornerCases, CornerCasesDouble) +{ + run_test(); +} + +TEST_F(NVGraphCAPITests_WidestPath_CornerCases, CornerCasesFloat) +{ + run_test(); +} + + +class NVGraphCAPITests_Pagerank_CornerCases : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_Pagerank_CornerCases() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSC_32; + nvgraphStatus_t status; + if 
(handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + + template + void run_test() + { + n = 1024; + nnz = n - 1; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[1+i] = i; + neighborhood[i] = i; + } + offsets[0] = 0; + std::vector edge_data(nnz, 1.0); + std::vector dangling(n, 0); + dangling[n-1] = (T)(1); + + nvgraphCSCTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topology.nvertices; + nnz = topology.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + int bookmark_index = 0; + int weight_index = 0; + T alpha = 0.85; + T alpha_bad = -10.0; + int pagerank_index = 1; + int has_guess = 0; + float tolerance = 1e-6;//sizeof(T) > 4 ? 1e-8f : 1e-6f; + int max_iter = 1000; + + // should be multivalued + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + + status = nvgraphSetVertexData(handle, g1, (void*)&dangling[0], bookmark_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void*)&edge_data[0], weight_index ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // different invalid values + status = nvgraphPagerank(NULL, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, NULL, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, 500, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, NULL, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha_bad, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, 500, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, 500, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, 500, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, -10.0f, 
max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, 10.0f, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + { + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // only 32F and 64F real are supported + // but we cannot check Pagerank for that because AllocateData will throw an error first + /*for (int i = 0; i < 10; i++) + { + if (i == CUDA_R_32F || i == CUDA_R_64F) + continue; + cudaDataType_t t_type_v[2] = {(cudaDataType_t)i, (cudaDataType_t)i}; + cudaDataType_t t_type_e[1] = {(cudaDataType_t)i}; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSC_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 2, t_type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, t_type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_TYPE_NOT_SUPPORTED, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + */ + } +}; + +TEST_F(NVGraphCAPITests_Pagerank_CornerCases, CornerCasesDouble) +{ + run_test(); +} + +TEST_F(NVGraphCAPITests_Pagerank_CornerCases, CornerCasesFloat) +{ + run_test(); +} + + +class NVGraphCAPITests_SrSPMV_Stress : public ::testing::TestWithParam { + public: + NVGraphCAPITests_SrSPMV_Stress() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + //const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + //printf("We are in test %s of test case %s.\n", test_info->name(), test_info->test_case_name()); + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const SrSPMV_Usecase& param) + { + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + 
int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); + fclose(fpin); + + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + //@TODO: random fill? + std::vector calculated_res(n); + std::vector data1(n), data2(n); + for (int i = 0; i < n; i++) + { + data1[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + data2[i] = (T)(1.0*rand()/RAND_MAX - 0.5); + } + void* vertexptr[2] = {(void*)&data1[0], (void*)&data2[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, 2, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], 1 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int weight_index = 0; + int x_index = 0; + int y_index = 1; + + // reinit data + status = nvgraphSetVertexData(handle, g1, (void*)&data2[0], y_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + + T alphaT = (T)param.alpha; + T betaT = (T)param.beta; + + // run + int repeat = std::max((int)(((float)(SRSPMV_ITER_MULTIPLIER)*STRESS_MULTIPLIER)/n), 1); + //printf ("Repeating C API call for %d times\n", repeat); + std::vector calculated_res1(n), calculated_res_mid(n); + size_t free_mid = 0, free_last = 0, total = 0; + for (int i = 0; i < repeat; i++) + { +// cudaMemGetInfo(&t, &total); +// printf("Iteration: %d, freemem: %zu\n", i, t); + + status = nvgraphSrSpmv(handle, g1, weight_index, (void*)&alphaT, x_index, (void*)&betaT, y_index, param.sr); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // all of those should be equal + if (i == 0) + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res1[0], y_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + else + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_mid[0], y_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + // stronger condition - bit by bit equality + /* + if (calculated_res1[row] != calculated_res_mid[row]) + { + typename nvgraph_Const::fpint_st comp1, comp2; + comp1.f = calculated_res1[row]; + comp2.f = calculated_res_mid[row]; + ASSERT_EQ(comp1.u, comp2.u) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } 
+ */ + ASSERT_NEAR(calculated_res1[row], calculated_res_mid[row], nvgraph_Const::tol) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + } + if (i == std::min(50, (int)(repeat/2))) + { + cudaMemGetInfo(&free_mid, &total); + } + if (i == repeat-1) + { + cudaMemGetInfo(&free_last, &total); + } + + // reset vectors + status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], 1 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + ASSERT_LE(free_mid, free_last) << "Memory difference between iteration #" << std::min(50, (int)(repeat/2)) << " and last iteration is " << (double)(free_last-free_mid)/1e+6 << "MB"; + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(NVGraphCAPITests_SrSPMV_Stress, StressDouble) +{ + run_current_test(GetParam()); + +} + +TEST_P(NVGraphCAPITests_SrSPMV_Stress, StressFloat) +{ + run_current_test(GetParam()); +} + + + +class NVGraphCAPITests_Widest_Stress : public ::testing::TestWithParam { + public: + NVGraphCAPITests_Widest_Stress() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + //const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + //printf("We are in test %s of test case %s.\n", test_info->name(), test_info->test_case_name()); + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const WidestPath_Usecase& param) + { + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); + fclose(fpin); + + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." 
<< test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + size_t numsets = 1; + std::vector calculated_res(n); + //void* vertexptr[1] = {(void*)&calculated_res[0]}; + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, numsets, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, numsets, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int weight_index = 0; + int source_vert = param.source_vert; + int widest_path_index = 0; + + // run + int repeat = std::max((int)(((float)(WIDEST_ITER_MULTIPLIER)*STRESS_MULTIPLIER)/(3*n)), 1); + //printf ("Repeating C API call for %d times\n", repeat); + std::vector calculated_res1(n), calculated_res_mid(n); + size_t free_mid = 0, free_last = 0, total = 0; + for (int i = 0; i < repeat; i++) + { + //cudaMemGetInfo(&t, &total); + //printf("Iteration: %d, freemem: %zu\n", i, t); + + status = nvgraphWidestPath(handle, g1, weight_index, &source_vert, widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // all of those should be equal + if (i == 0) + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res1[0], widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + else + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_mid[0], widest_path_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + // stronger condition - bit by bit equality + /* + if (calculated_res1[row] != calculated_res_mid[row]) + { + typename nvgraph_Const::fpint_st comp1, comp2; + comp1.f = calculated_res1[row]; + comp2.f = calculated_res_mid[row]; + ASSERT_EQ(comp1.u, comp2.u) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + */ + ASSERT_NEAR(calculated_res1[row], calculated_res_mid[row], nvgraph_Const::tol) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + } + + if (i == std::min(50, (int)(repeat/2))) + { + cudaMemGetInfo(&free_mid, &total); + } + if (i == repeat-1) + { + cudaMemGetInfo(&free_last, &total); + } + } + + ASSERT_LE(free_mid, free_last) << "Memory difference between iteration #" << std::min(50, (int)(repeat/2)) << " and last iteration is " << (double)(free_last-free_mid)/1e+6 << "MB"; + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(NVGraphCAPITests_Widest_Stress, StressDouble) +{ + run_current_test(GetParam()); + +} + +TEST_P(NVGraphCAPITests_Widest_Stress, StressFloat) +{ + run_current_test(GetParam()); +} + + + + +class NVGraphCAPITests_SSSP_Stress : public ::testing::TestWithParam { + public: + NVGraphCAPITests_SSSP_Stress() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + //const ::testing::TestInfo* const test_info 
=::testing::UnitTest::GetInstance()->current_test_info(); + //printf("We are in test %s of test case %s.\n", test_info->name(), test_info->test_case_name()); + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const SSSP_Usecase& param) + { + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); + fclose(fpin); + + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + size_t numsets = 1; + std::vector calculated_res(n); + //void* vertexptr[1] = {(void*)&calculated_res[0]}; + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, numsets, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, numsets, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int weight_index = 0; + int source_vert = param.source_vert; + int sssp_index = 0; + + // run + int repeat = std::max((int)(((float)(SSSP_ITER_MULTIPLIER)*STRESS_MULTIPLIER)/(3*n)), 1); + //printf ("Repeating C API call for %d times\n", repeat); + std::vector calculated_res1(n), calculated_res_mid(n), calculated_res_last(n); + size_t free_mid = 0, free_last = 0, total = 0; + for (int i = 0; i < repeat; i++) + { +// cudaMemGetInfo(&t, &total); +// printf("Iteration: %d, freemem: %zu\n", i, t); + + status = nvgraphSssp(handle, g1, weight_index, &source_vert, sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // all of those should be equal + if (i == 0) + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res1[0], sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + else + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_mid[0], sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + // stronger condition - bit by bit equality + /* + if (calculated_res1[row] != calculated_res_mid[row]) + { + typename nvgraph_Const::fpint_st comp1, comp2; + comp1.f = 
calculated_res1[row]; + comp2.f = calculated_res_mid[row]; + ASSERT_EQ(comp1.u, comp2.u) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + */ + ASSERT_NEAR(calculated_res1[row], calculated_res_mid[row], nvgraph_Const::tol) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + } + + if (i == std::min(50, (int)(repeat/2))) + { + cudaMemGetInfo(&free_mid, &total); + } + if (i == repeat-1) + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_last[0], sssp_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaMemGetInfo(&free_last, &total); + } + } + + ASSERT_LE(free_mid, free_last) << "Memory difference between iteration #" << std::min(50, (int)(repeat/2)) << " and last iteration is " << (double)(free_last-free_mid)/1e+6 << "MB"; + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(NVGraphCAPITests_SSSP_Stress, StressDouble) +{ + run_current_test(GetParam()); + +} + +TEST_P(NVGraphCAPITests_SSSP_Stress, StressFloat) +{ + run_current_test(GetParam()); +} + + + + +class NVGraphCAPITests_Pagerank_Stress : public ::testing::TestWithParam { + public: + NVGraphCAPITests_Pagerank_Stress() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + //const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + //printf("We are in test %s of test case %s.\n", test_info->name(), test_info->test_case_name()); + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const Pagerank_Usecase& param) + { + nvgraphTopologyType_t topo = NVGRAPH_CSC_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + std::vector dangling(n); + ASSERT_EQ(read_data_amgx_csr_bin_rhs (fpin, n, nnz, read_row_ptr, read_col_ind, read_val, dangling), 0); + fclose(fpin); + + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." 
<< test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSCTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + std::vector calculated_res(n, (T)1.0/n); + void* vertexptr[2] = {(void*)&dangling[0], (void*)&calculated_res[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&read_val[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, 2, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, g1, vertexptr[1], 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, 1, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int bookmark_index = 0; + int weight_index = 0; + T alpha = param.alpha; + int pagerank_index = 1; + int has_guess = 1; + float tolerance = {sizeof(T) > 4 ? 1e-8f : 1e-6f}; + int max_iter = 1000; + + + // run + int repeat = std::max((int)(((float)(PAGERANK_ITER_MULTIPLIER)*STRESS_MULTIPLIER)/n), 1); + //printf ("Repeating C API call for %d times\n", repeat); + std::vector calculated_res1(n), calculated_res_mid(n); + size_t free_mid = 0, free_last = 0, total = 0; + for (int i = 0; i < repeat; i++) + { + //cudaMemGetInfo(&t, &total); + //printf("Iteration: %d, freemem: %zu\n", i, t); + + status = nvgraphPagerank(handle, g1, weight_index, (void*)&alpha, bookmark_index, has_guess, pagerank_index, tolerance, max_iter); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // all of those should be equal + if (i == 0) + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res1[0], pagerank_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + else + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_mid[0], pagerank_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + // stronger condition - bit by bit equality + /* + if (calculated_res1[row] != calculated_res_mid[row]) + { + typename nvgraph_Const::fpint_st comp1, comp2; + comp1.f = calculated_res1[row]; + comp2.f = calculated_res_mid[row]; + ASSERT_EQ(comp1.u, comp2.u) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + */ + ASSERT_NEAR(calculated_res1[row], calculated_res_mid[row], nvgraph_Const::tol) << "Difference in result in row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + } + + if (i == std::min(50, (int)(repeat/2))) + { + cudaMemGetInfo(&free_mid, &total); + } + if (i == repeat-1) + { + cudaMemGetInfo(&free_last, &total); + } + } + + ASSERT_LE(free_mid, free_last) << "Memory difference between iteration #" << std::min(50, (int)(repeat/2)) << " and last iteration is " << (double)(free_last-free_mid)/1e+6 << "MB"; + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(NVGraphCAPITests_Pagerank_Stress, StressDouble) +{ + run_current_test(GetParam()); + +} + +TEST_P(NVGraphCAPITests_Pagerank_Stress, 
StressFloat) +{ + run_current_test(GetParam()); +} + + + +// instatiation of the performance/correctness checks + +INSTANTIATE_TEST_CASE_P(CorrectnessCheck1, + NVGraphCAPITests_SrSPMV, + ::testing::Values( // maybe check NVGRAPH_OR_AND_SR on some special bool matrices? + SrSPMV_Usecase("graphs/small/small.bin", NVGRAPH_PLUS_TIMES_SR, 1, 1) + , SrSPMV_Usecase("graphs/small/small.bin", NVGRAPH_MIN_PLUS_SR, 1, 1) + , SrSPMV_Usecase("graphs/small/small.bin", NVGRAPH_MAX_MIN_SR, 1, 1) + , SrSPMV_Usecase("graphs/small/small.bin", NVGRAPH_OR_AND_SR, 1, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_PLUS_TIMES_SR, 0, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MIN_PLUS_SR, 0, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MAX_MIN_SR, 0, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_OR_AND_SR, 0, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_PLUS_TIMES_SR, 0, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MIN_PLUS_SR, 0, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MAX_MIN_SR, 0, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_OR_AND_SR, 0, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_PLUS_TIMES_SR, 1, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MIN_PLUS_SR, 1, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MAX_MIN_SR, 1, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_OR_AND_SR, 1, 0) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_PLUS_TIMES_SR, 1, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MIN_PLUS_SR, 1, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_MAX_MIN_SR, 1, 1) + , SrSPMV_Usecase("graphs/dblp/dblp.bin", NVGRAPH_OR_AND_SR, 1, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_PLUS_TIMES_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MIN_PLUS_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MAX_MIN_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_OR_AND_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_PLUS_TIMES_SR, 0, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MIN_PLUS_SR, 0, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MAX_MIN_SR, 0, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_OR_AND_SR, 0, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_PLUS_TIMES_SR, 1, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MIN_PLUS_SR, 1, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MAX_MIN_SR, 1, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_OR_AND_SR, 1, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_PLUS_TIMES_SR, 1, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MIN_PLUS_SR, 1, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_MAX_MIN_SR, 1, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2003/wiki2003.bin", NVGRAPH_OR_AND_SR, 1, 1) + ///// more instances + ) + ); + + +INSTANTIATE_TEST_CASE_P(CorrectnessCheck2, + NVGraphCAPITests_SrSPMV, + ::testing::Values( + SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_PLUS_TIMES_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MIN_PLUS_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MAX_MIN_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_OR_AND_SR, 0, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_PLUS_TIMES_SR, 0, 
1) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MIN_PLUS_SR, 0, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MAX_MIN_SR, 0, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_OR_AND_SR, 0, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MIN_PLUS_SR, 1, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MAX_MIN_SR, 1, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_OR_AND_SR, 1, 0) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MIN_PLUS_SR, 1, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MAX_MIN_SR, 1, 1) + , SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_OR_AND_SR, 1, 1) + // these tests fails because of exceeding tolerance: diff = 0.00012826919555664062 vs tol = 9.9999997473787516e-05 + //, SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_PLUS_TIMES_SR, 1, 1) + //, SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_PLUS_TIMES_SR, 1, 0) + ///// more instances + ) + ); + + +INSTANTIATE_TEST_CASE_P(CorrectnessCheck, + NVGraphCAPITests_WidestPath, + // graph FILE source vert # file with expected result (in binary?) +// // we read matrix stored in CSR and pass it as CSC - so matrix is in fact transposed, that's why we compare it to the results calculated on a transposed matrix + ::testing::Values( + WidestPath_Usecase("graphs/cage/cage13_T.mtx.bin", 0, "graphs/cage/cage13.widest_0.bin") + , WidestPath_Usecase("graphs/cage/cage13_T.mtx.bin", 101, "graphs/cage/cage13.widest_101.bin") + , WidestPath_Usecase("graphs/cage/cage14_T.mtx.bin", 0, "graphs/cage/cage14.widest_0.bin") + , WidestPath_Usecase("graphs/cage/cage14_T.mtx.bin", 101, "graphs/cage/cage14.widest_101.bin") + // file might be missing on eris + //, WidestPath_Usecase("graphs/small/small_T.bin", 2, "graphs/small/small_T.widest_2.bin") + , WidestPath_Usecase("graphs/dblp/dblp.bin", 100, "graphs/dblp/dblp_T.widest_100.bin") + , WidestPath_Usecase("graphs/dblp/dblp.bin", 100000, "graphs/dblp/dblp_T.widest_100000.bin") + , WidestPath_Usecase("graphs/Wikipedia/2003/wiki2003_T.bin", 100, "graphs/Wikipedia/2003/wiki2003_T.widest_100.bin") + , WidestPath_Usecase("graphs/Wikipedia/2003/wiki2003_T.bin", 100000, "graphs/Wikipedia/2003/wiki2003_T.widest_100000.bin") + , WidestPath_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 6543, "") + //, WidestPath_Usecase("dimacs10/kron_g500-logn20_T.mtx.bin", 100000, "") + //, WidestPath_Usecase("dimacs10/hugetrace-00020_T.mtx.bin", 100000, "") + //, WidestPath_Usecase("dimacs10/delaunay_n24_T.mtx.bin", 100000, "") + //, WidestPath_Usecase("dimacs10/road_usa_T.mtx.bin", 100000, "") + //, WidestPath_Usecase("dimacs10/hugebubbles-00020_T.mtx.bin", 100000, "") + ///// more instances + ) + ); + + +INSTANTIATE_TEST_CASE_P(CorrectnessCheck, + NVGraphCAPITests_SSSP, + // graph FILE source vert # file with expected result (in binary?) 
+// // we read matrix stored in CSR and pass it as CSC - so matrix is in fact transposed, that's why we compare it to the results calculated on a transposed matrix + ::testing::Values( + SSSP_Usecase("graphs/cage/cage13_T.mtx.bin", 0, "graphs/cage/cage13.sssp_0.bin") + , SSSP_Usecase("graphs/cage/cage13_T.mtx.bin", 101, "graphs/cage/cage13.sssp_101.bin") + , SSSP_Usecase("graphs/cage/cage14_T.mtx.bin", 0, "graphs/cage/cage14.sssp_0.bin") + , SSSP_Usecase("graphs/cage/cage14_T.mtx.bin", 101, "graphs/cage/cage14.sssp_101.bin") + , SSSP_Usecase("graphs/small/small.bin", 2, "graphs/small/small.sssp_2.bin") + , SSSP_Usecase("graphs/dblp/dblp.bin", 100, "graphs/dblp/dblp_T.sssp_100.bin") + , SSSP_Usecase("graphs/dblp/dblp.bin", 100000, "graphs/dblp/dblp_T.sssp_100000.bin") + , SSSP_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 100, "graphs/Wikipedia/2003/wiki2003_T.sssp_100.bin") + , SSSP_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 100000, "graphs/Wikipedia/2003/wiki2003_T.sssp_100000.bin") + , SSSP_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 6543, "") + //, SSSP_Usecase("dimacs10/kron_g500-logn20_T.mtx.bin", 100000, "") + //, SSSP_Usecase("dimacs10/hugetrace-00020_T.mtx.bin", 100000, "") + //, SSSP_Usecase("dimacs10/delaunay_n24_T.mtx.bin", 100000, "") + //, SSSP_Usecase("dimacs10/road_usa_T.mtx.bin", 100000, "") + //, SSSP_Usecase("dimacs10/hugebubbles-00020_T.mtx.bin", 100000, "") + ///// more instances + ) + ); +INSTANTIATE_TEST_CASE_P(CorrectnessCheck, + NVGraphCAPITests_Pagerank, + // graph FILE alpha file with expected result + ::testing::Values( + // Pagerank_Usecase("graphs/small/small_T.bin", 0.85, "graphs/small/small.pagerank_val_0.85.bin"), + Pagerank_Usecase("graphs/webbase1M/webbase-1M_T.mtx.bin", 0.85, "graphs/webbase1M/webbase-1M.pagerank_val_0.85.bin"), + Pagerank_Usecase("graphs/webBerkStan/web-BerkStan_T.mtx.bin", 0.85, "graphs/webBerkStan/web-BerkStan.pagerank_val_0.85.bin"), + Pagerank_Usecase("graphs/webGoogle/web-Google_T.mtx.bin", 0.85, "graphs/webGoogle/web-Google.pagerank_val_0.85.bin"), + Pagerank_Usecase("graphs/WikiTalk/wiki-Talk_T.mtx.bin", 0.85, "graphs/WikiTalk/wiki-Talk.pagerank_val_0.85.bin"), + Pagerank_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 0.85, "graphs/citPatents/cit-Patents.pagerank_val_0.85.bin"), + Pagerank_Usecase("graphs/liveJournal/ljournal-2008_T.mtx.bin", 0.85, "graphs/liveJournal/ljournal-2008.pagerank_val_0.85.bin"), + Pagerank_Usecase("dummy", 0.85, ""), + Pagerank_Usecase("dimacs10/delaunay_n24_T.mtx.bin", 0.85, ""), + Pagerank_Usecase("dummy", 0.85, ""), // waived until cublas change, see http://nvbugs/200189611, was: Pagerank_Usecase("dimacs10/hugebubbles-00020_T.mtx.bin", 0.85, ""), + Pagerank_Usecase("dimacs10/hugetrace-00020_T.mtx.bin", 0.85, "", 10.0), + Pagerank_Usecase("dimacs10/kron_g500-logn20_T.mtx.bin", 0.85, ""), + Pagerank_Usecase("dimacs10/road_usa_T.mtx.bin", 0.85, "") + //Pagerank_Usecase("dimacs10/channel-500x100x100-b050_T.mtx.bin", 0.85, ""), + //Pagerank_Usecase("dimacs10/coPapersCiteseer_T.mtx.bin", 0.85, "") + ///// more instances + ) + ); + + +//INSTANTIATE_TEST_CASE_P(CorrectnessCheck, +// NVGraphCAPITests_KrylovPagerank, +// // graph FILE alpha file with expected result +// ::testing::Values( +// //Pagerank_Usecase("graphs/small/small_T.bin", 0.85, "graphs/small/small.pagerank_val_0.85.bin"), +// Pagerank_Usecase("graphs/webbase1M/webbase-1M_T.mtx.bin", 0.85, "graphs/webbase1M/webbase-1M.pagerank_val_0.85.bin"), +// Pagerank_Usecase("graphs/webBerkStan/web-BerkStan_T.mtx.bin", 0.85, 
"graphs/webBerkStan/web-BerkStan.pagerank_val_0.85.bin"), +// Pagerank_Usecase("graphs/webGoogle/web-Google_T.mtx.bin", 0.85, "graphs/webGoogle/web-Google.pagerank_val_0.85.bin"), +// Pagerank_Usecase("graphs/WikiTalk/wiki-Talk_T.mtx.bin", 0.85, "graphs/WikiTalk/wiki-Talk.pagerank_val_0.85.bin"), +// Pagerank_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 0.85, "graphs/citPatents/cit-Patents.pagerank_val_0.85.bin"), +// Pagerank_Usecase("graphs/liveJournal/ljournal-2008_T.mtx.bin", 0.85, "graphs/liveJournal/ljournal-2008.pagerank_val_0.85.bin"), +// Pagerank_Usecase("dummy", 0.85, ""), +// Pagerank_Usecase("dimacs10/delaunay_n24_T.mtx.bin", 0.85, ""), +// Pagerank_Usecase("dimacs10/hugebubbles-00020_T.mtx.bin", 0.85, ""), +// Pagerank_Usecase("dimacs10/hugetrace-00020_T.mtx.bin", 0.85, "", 10.0), +// Pagerank_Usecase("dimacs10/kron_g500-logn20_T.mtx.bin", 0.85, ""), +// Pagerank_Usecase("dimacs10/road_usa_T.mtx.bin", 0.85, "") +// //Pagerank_Usecase("dimacs10/channel-500x100x100-b050_T.mtx.bin", 0.85, ""), +// //Pagerank_Usecase("dimacs10/coPapersCiteseer_T.mtx.bin", 0.85, "") +// ///// more instances +// ) +// ); + +INSTANTIATE_TEST_CASE_P(StressTest, + NVGraphCAPITests_SrSPMV_Stress, + ::testing::Values( + SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_PLUS_TIMES_SR, 1, 1), + SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MIN_PLUS_SR, 1, 1), + SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_MAX_MIN_SR, 1, 1), + SrSPMV_Usecase("graphs/Wikipedia/2011/wiki2011.bin", NVGRAPH_OR_AND_SR, 1, 1) + ) + ); + + +INSTANTIATE_TEST_CASE_P(StressTest, + NVGraphCAPITests_Widest_Stress, + ::testing::Values( + WidestPath_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 6543, "") + ) + ); + + +INSTANTIATE_TEST_CASE_P(StressTest, + NVGraphCAPITests_SSSP_Stress, + ::testing::Values( + SSSP_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 6543, "") + ) + ); + + +INSTANTIATE_TEST_CASE_P(StressTest, + NVGraphCAPITests_Pagerank_Stress, + ::testing::Values( + Pagerank_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 0.7, "") + ) + ); + + +int main(int argc, char **argv) +{ + + for (int i = 0; i < argc; i++) + { + if (strcmp(argv[i], "--perf") == 0) + PERF = 1; + if (strcmp(argv[i], "--stress-iters") == 0) + STRESS_MULTIPLIER = atoi(argv[i+1]); + if (strcmp(argv[i], "--ref-data-dir") == 0) + ref_data_prefix = std::string(argv[i+1]); + if (strcmp(argv[i], "--graph-data-dir") == 0) + graph_data_prefix = std::string(argv[i+1]); + } + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_clustering.cpp b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_clustering.cpp new file mode 100644 index 00000000000..24c72f6b20c --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_clustering.cpp @@ -0,0 +1,655 @@ +#include +#include "gtest/gtest.h" +#include "nvgraph_test_common.h" +#include "valued_csr_graph.hxx" +#include "readMatrix.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include "nvgraph_experimental.h" +#include "stdlib.h" +#include +extern "C" { +#include "mmio.h" +} +#include "mm.hxx" + +// do the perf measurements, enabled by command line parameter '--perf' +static int PERF = 0; + +// minimum vertices in the graph to perform perf measurements +#define PERF_ROWS_LIMIT 1000 + +// number of repeats = multiplier/num_vertices +#define PARTITIONER_ITER_MULTIPLIER 1 +#define SELECTOR_ITER_MULTIPLIER 1 + +// iterations for stress tests = this multiplier * iterations for perf tests 
+static int STRESS_MULTIPLIER = 10; + +static std::string ref_data_prefix = ""; +static std::string graph_data_prefix = ""; + +// utility +template <typename T> +struct nvgraph_Const; + +template <> +struct nvgraph_Const<double> +{ + static const cudaDataType_t Type = CUDA_R_64F; + static const double inf; + static const double tol; + typedef union fpint + { + double f; + unsigned long u; + } fpint_st; +}; + +const double nvgraph_Const<double>::inf = DBL_MAX; +const double nvgraph_Const<double>::tol = 1e-6; // this is what we use as a tolerance in the algorithms, more precision than this is useless for CPU reference comparison + +template <> +struct nvgraph_Const<float> +{ + static const cudaDataType_t Type = CUDA_R_32F; + static const float inf; + static const float tol; + + typedef union fpint + { + float f; + unsigned u; + } fpint_st; + +}; + +const float nvgraph_Const<float>::inf = FLT_MAX; +const float nvgraph_Const<float>::tol = 1e-4; + +template <typename T> +bool enough_device_memory(int n, int nnz, size_t add) +{ + size_t mtotal, mfree; + cudaMemGetInfo(&mfree, &mtotal); + if (mfree > add + sizeof(T)*3*(n + nnz)) + return true; + return false; +} + +std::string convert_to_local_path(const std::string& in_file) +{ + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (graph_data_prefix.length() > 0) + { + prefix = graph_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\matrices_collection\\"; + prefix = "Z:\\matrices_collection\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); +#else + prefix = "/mnt/nvgraph_test_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + +std::string convert_to_local_path_refdata(const std::string& in_file) +{ + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (ref_data_prefix.length() > 0) + { + prefix = ref_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\ref_data\\"; + prefix = "Z:\\ref_data\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); +#else + prefix = "/mnt/nvgraph_test_data/ref_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + + +/**************************** +* SPECTRAL CLUSTERING +*****************************/ + +typedef struct SpectralClustering_Usecase_t +{ + std::string graph_file; + int clusters; + int eigenvalues; + nvgraphSpectralClusteringType_t algorithm; + nvgraphClusteringMetric_t metric; + SpectralClustering_Usecase_t(const std::string& a, int b, int c, nvgraphSpectralClusteringType_t d, nvgraphClusteringMetric_t e) : clusters(b), eigenvalues(c), algorithm(d), metric(e){ graph_file = convert_to_local_path(a);}; + SpectralClustering_Usecase_t& operator=(const SpectralClustering_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + clusters = rhs.clusters; + eigenvalues = rhs.eigenvalues; + algorithm = rhs.algorithm; + metric = rhs.metric; + return *this; + } +} SpectralClustering_Usecase; + + +class NVGraphCAPITests_SpectralClustering : public ::testing::TestWithParam<SpectralClustering_Usecase> { + public: + NVGraphCAPITests_SpectralClustering() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template <typename T> + void run_current_test(const
SpectralClustering_Usecase& param) + { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.clusters; + ss << param.eigenvalues; + ss << param.algorithm; + ss << param.metric; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + std::string("_") + ss.str().c_str(); + + nvgraphStatus_t status; + int m, n, nnz; + MM_typecode mc; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + + ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &n, &nnz),0) << "could not read Matrix Market file properties"<< "\n"; + + ASSERT_TRUE(mm_is_matrix(mc)); + ASSERT_TRUE(mm_is_coordinate(mc)); + ASSERT_TRUE(m==n); + ASSERT_FALSE(mm_is_complex(mc)); + ASSERT_FALSE(mm_is_skew(mc)); + + // Allocate memory on host + std::vector<int> cooRowIndA(nnz); + std::vector<int> csrColIndA(nnz); + std::vector<int> csrRowPtrA(n+1); + std::vector<T> csrValA(nnz); + + ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0], NULL)) , 0)<< "could not read matrix data"<< "\n"; + ASSERT_EQ( (coo_to_csr (n, n, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0], NULL, &csrRowPtrA[0], NULL, NULL, NULL)), 0) << "could not convert COO to CSR "<< "\n"; + + ASSERT_EQ(fclose(fpin),0); + //ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + + int *clustering_d; + + + if (!enough_device_memory<T>(n, nnz, sizeof(int)*(csrRowPtrA.size() + csrColIndA.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + cudaMalloc((void**)&clustering_d , n*sizeof(int)); + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &csrRowPtrA[0], &csrColIndA[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32); + + // set up graph data + size_t numsets = 1; + + void* edgeptr[1] = {(void*)&csrValA[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const<T>::Type}; + + status = nvgraphAllocateEdgeData(handle, g1, numsets, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int weight_index = 0; + struct SpectralClusteringParameter clustering_params; + clustering_params.n_clusters = param.clusters; + clustering_params.n_eig_vects = param.eigenvalues; + clustering_params.algorithm = param.algorithm; + clustering_params.evs_tolerance = 0.0f ; + clustering_params.evs_max_iter = 0; + clustering_params.kmean_tolerance = 0.0f; + clustering_params.kmean_max_iter = 0; + + std::vector<int> random_assignments_h(n); + std::vector<T> eigVals_h(param.eigenvalues); + std::vector<T> eigVecs_h(n*param.eigenvalues); + float score = 0.0, random_score = 0.0; + + if (PERF && n > PERF_ROWS_LIMIT) + { + double start, stop; + start = second(); + int repeat = std::max((int)((float)(PARTITIONER_ITER_MULTIPLIER)/n), 1); + for (int i = 0; i < repeat; i++) + status =nvgraphSpectralClustering(handle, g1, weight_index, &clustering_params, clustering_d, &eigVals_h[0], &eigVecs_h[0]); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", test_id.c_str(), 1000.0*(stop-start)/repeat); + } + else + status
=nvgraphSpectralClustering(handle, g1, weight_index, &clustering_params, clustering_d, &eigVals_h[0], &eigVecs_h[0]); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // Analyse quality + status = nvgraphAnalyzeClustering(handle, g1, weight_index, param.clusters, clustering_d, param.metric, &score); + + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + //printf("Score = %f\n", score); + + // === + // Synthetic random + + for (int i=0; i(GetParam()); + +} + +TEST_P(NVGraphCAPITests_SpectralClustering, CheckResultFloat) +{ + run_current_test(GetParam()); +} + +// --gtest_filter=*ModularityCorrectness* +INSTANTIATE_TEST_CASE_P(SpectralModularityCorrectnessCheck, + NVGraphCAPITests_SpectralClustering, + // graph FILE number of clusters # number of eigenvalues # + ::testing::Values( + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 2, 2, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 3, 3, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/uk.mtx", 2, 2, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/uk.mtx", 3, 3, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/data.mtx", 3, 3, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/data.mtx", 5, 5, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/data.mtx", 7, 7, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/cti.mtx", 3, 3,NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/cti.mtx", 5, 5,NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/cti.mtx", 7, 7,NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY) + ///// more instances + ) + ); + +// --gtest_filter=*ModularityCorner* +INSTANTIATE_TEST_CASE_P(SpectralModularityCornerCheck, + NVGraphCAPITests_SpectralClustering, + // graph FILE number of clusters # number of eigenvalues # + ::testing::Values( + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/uk.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/delaunay_n12.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/data.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/cti.mtx", 7, 4,NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 7, 4, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 17, 7, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY) + // tests cases on coAuthorsDBLP may diverge on some cards (likely due to different normalization operation) + //SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 7, 4,NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), + //SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 17, 7,NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY) + ///// more instances + ) + ); +// --gtest_filter=*LanczosBlancedCutCorrectness* +INSTANTIATE_TEST_CASE_P(SpectralLanczosBlancedCutCorrectnessCheck, + NVGraphCAPITests_SpectralClustering, + // graph FILE number of clusters # number of eigenvalues # + ::testing::Values( + 
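+ // each entry below is (graph file, n_clusters, n_eig_vects, algorithm, quality metric), as declared in SpectralClustering_Usecase_t above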
SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 2, 2,NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 4, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/uk.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n12.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/data.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/cti.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/cti.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT) + ///// more instances + ) + ); +// --gtest_filter=*LanczosBlancedCutCorner* +INSTANTIATE_TEST_CASE_P(SpectralLanczosBlancedCutCornerCheck, + NVGraphCAPITests_SpectralClustering, + // graph FILE number of clusters # number of eigenvalues # + ::testing::Values( + + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/uk.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n12.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/data.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/cti.mtx", 7, 4,NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 17, 7, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT) + // tests cases on coAuthorsDBLP may diverge on some cards (likely due to different normalization operation) + //SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 7, 4,NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT), + //SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 17, 7,NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_EDGE_CUT) + ) + ); + +// --gtest_filter=*LobpcgBlancedCutCorrectness* +INSTANTIATE_TEST_CASE_P(SpectralLobpcgBlancedCutCorrectnessCheck, + NVGraphCAPITests_SpectralClustering, + // graph FILE number of clusters # number of eigenvalues # + ::testing::Values( + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 2, 2,NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 4, 4, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/uk.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n12.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/cti.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), + SpectralClustering_Usecase("dimacs10/cti.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT) + ///// more instances + ) + ); +// 
--gtest_filter=*LobpcgBlancedCutCorner* +INSTANTIATE_TEST_CASE_P(SpectralLobpcgBlancedCutCornerCheck, + NVGraphCAPITests_SpectralClustering, + // graph FILE number of clusters # number of eigenvalues # + ::testing::Values( + SpectralClustering_Usecase("dimacs10/delaunay_n10.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/uk.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/delaunay_n12.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/data.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/cti.mtx", 7, 4,NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 7, 4, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT), + SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 17, 7, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT) + // tests cases on coAuthorsDBLP may diverge on some cards (likely due to different normalization operation) + //SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 7, 4,NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT), + //SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 17, 7,NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_EDGE_CUT) + ///// more instances + ) + ); +//Followinf tests were commented becasue they are a bit redundent and quite long to run +// previous tests already contain dataset with 1 million edges + +//// --gtest_filter=*ModularityLargeCorrectness* +//INSTANTIATE_TEST_CASE_P(SpectralModularityLargeCorrectnessCheck, +// NVGraphCAPITests_SpectralClustering, +// // graph FILE number of clusters # number of eigenvalues # +// ::testing::Values( +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 2, 2, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 3, 3, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 5, 5, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 7, 7, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 2, 2, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 3, 3, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 5, 5, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 7, 7, NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_MODULARITY) +// ///// more instances +// ) +// ); +// +//// --gtest_filter=*LanczosBlancedCutLargeCorrectness* +//INSTANTIATE_TEST_CASE_P(SpectralLanczosBlancedCutLargeCorrectnessCheck, +// NVGraphCAPITests_SpectralClustering, +// // graph FILE number of clusters # number of eigenvalues # +// ::testing::Values( +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 2, 2, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 5, 5, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LANCZOS, NVGRAPH_RATIO_CUT), +// 
SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 2, 2, NVGRAPH_BALANCED_CUT_LANCZOS,NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LANCZOS,NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 5, 5, NVGRAPH_BALANCED_CUT_LANCZOS,NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LANCZOS,NVGRAPH_RATIO_CUT) +// ) +// ); +//// --gtest_filter=*LobpcgBlancedCutLargeCorrectness* +//INSTANTIATE_TEST_CASE_P(SpectralLobpcgBlancedCutLargeCorrectnessCheck, +// NVGraphCAPITests_SpectralClustering, +// // graph FILE number of clusters # number of eigenvalues # +// ::testing::Values( +// //SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 2, 2, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 5, 5, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/citationCiteseer.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 2, 2, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 3, 3, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 5, 5, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT), +// SpectralClustering_Usecase("dimacs10/coAuthorsDBLP.mtx", 7, 7, NVGRAPH_BALANCED_CUT_LOBPCG, NVGRAPH_RATIO_CUT) +// ) +// ); +/**************************** +* SELECTOR +*****************************/ + +typedef struct Selector_Usecase_t +{ + std::string graph_file; + nvgraphEdgeWeightMatching_t metric; + Selector_Usecase_t(const std::string& a, nvgraphEdgeWeightMatching_t b) : metric(b){ graph_file = convert_to_local_path(a);}; + Selector_Usecase_t& operator=(const Selector_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + metric = rhs.metric; + return *this; + } +}Selector_Usecase; + +class NVGraphCAPITests_Selector : public ::testing::TestWithParam { + public: + NVGraphCAPITests_Selector() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const Selector_Usecase& param) + { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.metric; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file)+ std::string("_") + ss.str().c_str(); + + nvgraphStatus_t status; + int m, n, nnz; + MM_typecode mc; + + FILE* fpin = fopen(param.graph_file.c_str(),"r"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + + ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &n, &nnz),0) << "could not read Matrix Market file properties"<< "\n"; + + ASSERT_TRUE(mm_is_matrix(mc)); + ASSERT_TRUE(mm_is_coordinate(mc)); + ASSERT_TRUE(m==n); + 
ASSERT_FALSE(mm_is_complex(mc)); + ASSERT_FALSE(mm_is_skew(mc)); + + // Allocate memory on host + std::vector cooRowIndA(nnz); + std::vector csrColIndA(nnz); + std::vector csrRowPtrA(n+1); + std::vector csrValA(nnz); + + ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0], NULL)) , 0)<< "could not read matrix data"<< "\n"; + ASSERT_EQ( (coo_to_csr (n, n, nnz, &cooRowIndA[0], &csrColIndA[0], &csrValA[0], NULL, &csrRowPtrA[0], NULL, NULL, NULL)), 0) << "could not covert COO to CSR "<< "\n"; + + ASSERT_EQ(fclose(fpin),0); + //ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + + if (!enough_device_memory(n, nnz, sizeof(int)*(csrRowPtrA.size() + csrColIndA.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + //int *aggregates_d; + //cudaMalloc((void**)&aggregates_d , n*sizeof(int)); + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &csrRowPtrA[0], &csrColIndA[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSR_32); + + // set up graph data + size_t numsets = 1; + //void* vertexptr[1] = {(void*)&calculated_res[0]}; + //cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + + void* edgeptr[1] = {(void*)&csrValA[0]}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + //status = nvgraphAllocateVertexData(handle, g1, numsets, type_v); + //ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + //status = nvgraphSetVertexData(handle, g1, vertexptr[0], 0, NVGRAPH_CSR_32 ); + //ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(handle, g1, numsets, type_e ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, g1, (void *)edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int weight_index = 0; + std::vector aggregates_h(n); + //std::vector aggregates_global_h(n); + size_t num_aggregates; + size_t *num_aggregates_ptr = &num_aggregates; + + status = nvgraphHeavyEdgeMatching(handle, g1, weight_index, param.metric, &aggregates_h[0], num_aggregates_ptr); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + std::cout << "n = " << n << ", num aggregates = " << num_aggregates << std::endl; + + if (param.metric == NVGRAPH_SCALED_BY_DIAGONAL) + EXPECT_EQ(num_aggregates, static_cast(166)); // comparing against amgx result on poisson2D.mtx + else + EXPECT_LE(num_aggregates, static_cast(n)); // just make sure the output make sense + + //for (int i=0; i(GetParam()); + +} + +TEST_P(NVGraphCAPITests_Selector, CheckResultFloat) +{ + run_current_test(GetParam()); +} + +// --gtest_filter=*Correctness* +INSTANTIATE_TEST_CASE_P(SmallCorrectnessCheck, + NVGraphCAPITests_Selector, + // graph FILE SIMILARITY_METRIC + ::testing::Values( + Selector_Usecase("Florida/poisson2D.mtx", NVGRAPH_SCALED_BY_DIAGONAL), + Selector_Usecase("dimacs10/delaunay_n10.mtx", NVGRAPH_SCALED_BY_ROW_SUM), + Selector_Usecase("dimacs10/delaunay_n10.mtx", NVGRAPH_UNSCALED), + Selector_Usecase("dimacs10/uk.mtx", NVGRAPH_SCALED_BY_ROW_SUM), + Selector_Usecase("dimacs10/uk.mtx", NVGRAPH_UNSCALED), + Selector_Usecase("dimacs10/data.mtx", NVGRAPH_SCALED_BY_ROW_SUM), + Selector_Usecase("dimacs10/data.mtx", NVGRAPH_UNSCALED), + Selector_Usecase("dimacs10/cti.mtx", NVGRAPH_SCALED_BY_ROW_SUM), + Selector_Usecase("dimacs10/cti.mtx", NVGRAPH_UNSCALED) + ///// more instances + ) + 
); + +int main(int argc, char **argv) +{ + srand(42); + ::testing::InitGoogleTest(&argc, argv); + for (int i = 0; i < argc; i++) + { + if (strcmp(argv[i], "--perf") == 0) + PERF = 1; + if (strcmp(argv[i], "--stress-iters") == 0) + STRESS_MULTIPLIER = atoi(argv[i+1]); + if (strcmp(argv[i], "--ref-data-dir") == 0) + ref_data_prefix = std::string(argv[i+1]); + if (strcmp(argv[i], "--graph-data-dir") == 0) + graph_data_prefix = std::string(argv[i+1]); + + } + + return RUN_ALL_TESTS(); + + return 0; +} + + diff --git a/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_contraction.cpp b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_contraction.cpp new file mode 100644 index 00000000000..93c2c43e9ad --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_contraction.cpp @@ -0,0 +1,666 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "valued_csr_graph.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" + +//annonymus: +namespace{ +template +void fill_contraction_data(const std::string& fname, + Vector& g_row_offsets, + Vector& g_col_indices, + Vector& aggregates, + Vector& cg_row_offsets, + Vector& cg_col_indices) +{ + typedef typename Vector::value_type T; + std::ifstream m_stream(fname.c_str(), std::ifstream::in); + std::string line; + + if( !m_stream.is_open() ) + { + std::stringstream ss; + ss<<"ERROR: Could not open file: "<> value; + g_row_offsets.push_back(value); + } + + //ignore next 2 lines: + // + if( !std::getline(m_stream, line) || !std::getline(m_stream, line) ) return; + + g_col_indices.reserve(g_nnz); + + //read G col_indices: + for(int i=0;(i> value; + g_col_indices.push_back(value); + } + + //ignore next line: + // + if( !std::getline(m_stream, line) ) return; + + //remove the following for extraction: + //{ + if( !std::getline(m_stream, line) ) return; + int n_aggs = 0; + std::sscanf(line.c_str(),"aggregate: size=%d",&n_aggs); + + //assert( n_aggs == g_nrows );//not true for subgraph extraction! 
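+ // (As implied by the parsing in this helper, the expected text layout is: a header for G, its row_offsets, its col_indices, an "aggregate: size=N" block with one aggregate id per vertex, and finally a "result ..." header followed by the contracted graph's row_offsets and col_indices.)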
+ + aggregates.reserve(n_aggs); + + //read aggregate: + for(int i=0;(i> value; + aggregates.push_back(value); + } + //} end remove code for extraction + + if( !keep_going || !std::getline(m_stream, line) ) return; + int cg_nrows=0; + int cg_nnz=0; + std::sscanf(line.c_str(),"result %c: nrows=%d, nnz=%d",&c, &cg_nrows, &cg_nnz); + + //debug: + std::cout<> value; + cg_row_offsets.push_back(value); + } + + //ignore next 2 lines: + // + if( !std::getline(m_stream, line) || !std::getline(m_stream, line) ) return; + + cg_col_indices.reserve(cg_nnz); + + //read G col_indices: + for(int i=0;(i> value; + cg_col_indices.push_back(value); + } + + + m_stream.close();//not really needed...destructor handles this +} + +template +bool check_diffs(const Vector& v1, const Vector& v2) +{ + typedef typename Vector::value_type T; + + Vector v(v1.size(), 0); + std::transform(v1.begin(), v1.end(), + v2.begin(), + v.begin(), + std::minus()); + + if( std::find_if(v.begin(), v.end(), std::bind2nd(std::not_equal_to(), 0)) != v.end() ) + return true; + else + return false; +} + +//check if sort(delta(r1)) == sort(delta(r2)) +//where delta(r)={r[i+1]-r[i] | i <- [0..|r|-1]} +// +template +bool check_delta_invariant(const Vector& r1, const Vector& r2) +{ + typedef typename Vector::value_type T; + + size_t sz = r1.size(); + assert( sz == r2.size() ); + + Vector d1(sz-1); + + std::transform(r1.begin()+1, r1.end(), + r1.begin(), + d1.begin(), + std::minus()); + + Vector d2(sz-1); + + std::transform(r2.begin()+1, r2.end(), + r2.begin(), + d2.begin(), + std::minus()); + + std::sort(d1.begin(), d1.end()); + std::sort(d2.begin(), d2.end()); + + return (d1 == d2); +} +} + + +class NvgraphCAPITests_ContractionCSR : public ::testing::Test { + public: + NvgraphCAPITests_ContractionCSR() : nvgraph_handle(NULL), initial_graph(NULL) {} + + protected: + static void SetupTestCase() + { + } + static void TearDownTestCase() + { + } + virtual void SetUp() + { + if (nvgraph_handle == NULL) { + status = nvgraphCreate(&nvgraph_handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // set up graph + status = nvgraphCreateGraphDescr(nvgraph_handle, &initial_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st topoData; + topoData.nvertices = 5; + topoData.nedges = 9; + int neighborhood[] = {0, 2, 3, 5, 7, 9}; //row_offsets + int edgedest[] = {1, 3, 3, 1, 4, 0, 2, 2, 4};//col_indices + topoData.source_offsets = neighborhood; + topoData.destination_indices = edgedest; + status = nvgraphSetGraphStructure(nvgraph_handle, initial_graph,(void*) &topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph data + size_t numsets = 2; + float vertexvals0[] = {0.1f, 0.15893e-20f, 1e27f, 13.2f, 0.f}; + float vertexvals1[] = {13., 322.64, 1e28, -1.4, 22.3}; + void* vertexptr[] = {(void*)vertexvals0, (void*)vertexvals1}; + cudaDataType_t type_v[] = {CUDA_R_32F, CUDA_R_32F}; + float edgevals0[] = {0.1f, 0.9153e-20f, 0.42e27f, 185.23, 1e21f, 15.6f, 215.907f, 912.2f, 0.2f}; + float edgevals1[] = {13., 322.64, 1e28, 197534.2, 0.1, 0.425e-5, 5923.4, 0.12e-12, 52.}; + void* edgeptr[] = {(void*)edgevals0, (void*)edgevals1}; + cudaDataType_t type_e[] = {CUDA_R_32F, CUDA_R_32F}; + + status = nvgraphAllocateVertexData(nvgraph_handle, initial_graph, numsets, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(nvgraph_handle, initial_graph, (void *)vertexptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(nvgraph_handle, 
initial_graph, (void *)vertexptr[1], 1 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(nvgraph_handle, initial_graph, numsets, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(nvgraph_handle, initial_graph, (void *)edgeptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(nvgraph_handle, initial_graph, (void *)edgeptr[1], 1 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //save data - those will be available in the tests directly + graph_neigh.assign(neighborhood, neighborhood + topoData.nvertices + 1); + graph_edged.assign(edgedest, edgedest + topoData.nedges); + graph_vvals0.assign(vertexvals0, vertexvals0 + topoData.nvertices); + graph_vvals1.assign(vertexvals1, vertexvals1 + topoData.nvertices); + graph_evals0.assign(edgevals0, edgevals0 + topoData.nedges); + graph_evals1.assign(edgevals1, edgevals1 + topoData.nedges); + } + virtual void TearDown() + { + // destroy graph + if (nvgraph_handle != NULL) + { + status = nvgraphDestroyGraphDescr(nvgraph_handle, initial_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + nvgraph_handle = NULL; + } + // release library + if (nvgraph_handle != NULL) { + status = nvgraphDestroy(nvgraph_handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + nvgraph_handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t nvgraph_handle; + nvgraphGraphDescr_t initial_graph; + + std::vector graph_neigh; + std::vector graph_edged; + std::vector graph_vvals0; + std::vector graph_vvals1; + std::vector graph_evals0; + std::vector graph_evals1; +}; + +TEST_F(NvgraphCAPITests_ContractionCSR, CSRContractionTestCreation) +{ + nvgraphStatus_t status; + nvgraphGraphDescr_t temp_graph1 = NULL;//, temp_graph2 = NULL; + + { + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //size_t numaggregates = 3; + size_t szaggregates = 5; + int aggregates[] = {0, 1, 1, 0, 2}; + + //exception is being dumped by GTEST after [RUN]! 
+ //so try-catch is not needed and it doesn't help with that + // + try{ + int mult = 0; + int sum = 1; + status = nvgraphContractGraph(nvgraph_handle, initial_graph, temp_graph1, + aggregates, + szaggregates, + (nvgraphSemiringOps_t)mult, + (nvgraphSemiringOps_t)sum, + (nvgraphSemiringOps_t)mult, + (nvgraphSemiringOps_t)sum, + 0);//unused + } + catch( const std::exception& ex ) + { + // dump exception: + std::cerr<< "Exception:"< g_row_offsets; + std::vector g_col_indices; + + std::vector aggregates; + std::vector cg_row_offsets; + std::vector cg_col_indices; + + fill_contraction_data(fname, + g_row_offsets, + g_col_indices, + aggregates, + cg_row_offsets, + cg_col_indices); + + //std::cout<<"********* step 1: \n"; + + ASSERT_EQ( g_row_offsets.empty(), false); + ASSERT_EQ( g_col_indices.empty(), false); + ASSERT_EQ( aggregates.empty(), false); + ASSERT_EQ(cg_row_offsets.empty(), false); + ASSERT_EQ(cg_col_indices.empty(), false); + + //std::cout<<"********* step 1.1: \n"; + + ASSERT_EQ( g_col_indices.size(), g_row_offsets.back() ); + ASSERT_EQ( cg_col_indices.size(), cg_row_offsets.back()); + + //std::cout<<"********* step 1.2: \n"; + + nvgraphCSRTopology32I_st topoData; + topoData.nvertices = g_row_offsets.size()-1;//last is nnz + topoData.nedges = g_col_indices.size(); + + //std::cout<<"(n,m):"< vdata(topoData.nvertices, 1.); + void* vptr[] = {(void*) &vdata[0]}; + cudaDataType_t type_v[] = {CUDA_R_32F}; + + std::vector edata(topoData.nedges, 1.); + void* eptr[] = {(void*) &edata[0]}; + cudaDataType_t type_e[] = {CUDA_R_32F}; + + status = nvgraphAllocateVertexData(nvgraph_handle, + netx_graph, + numsets, + type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 3: \n"; + + status = nvgraphSetVertexData(nvgraph_handle, + netx_graph, + (void *)vptr[0], + 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 4: \n"; + + status = nvgraphAllocateEdgeData(nvgraph_handle, + netx_graph, + numsets, + type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 5: \n"; + + status = nvgraphSetEdgeData(nvgraph_handle, + netx_graph, + (void *)eptr[0], + 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 6: \n"; + + int mult = 0; + int sum = 1; + status = nvgraphContractGraph(nvgraph_handle, + netx_graph, + contracted_graph, + &aggregates[0], + aggregates.size(), + (nvgraphSemiringOps_t)mult, + (nvgraphSemiringOps_t)sum, + (nvgraphSemiringOps_t)mult, + (nvgraphSemiringOps_t)sum, + 0);//unused + + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 7: \n"; + + nvgraphCSRTopology32I_st tData; + tData.source_offsets=NULL; + tData.destination_indices=NULL; + + //1st time to get nvertices and nedges + // + status = nvgraphGetGraphStructure(nvgraph_handle, contracted_graph, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 8: \n"; + + int cgnv = cg_row_offsets.size()-1; + int cgne = cg_col_indices.size(); + ASSERT_EQ(tData.nvertices, cgnv); + ASSERT_EQ(tData.nedges, cgne); + + //std::cout<<"********* step 9: \n"; + + std::vector cgro(cgnv+1, 0); + std::vector cgci(cgne, 0); + + tData.source_offsets = &cgro[0]; + tData.destination_indices = &cgci[0]; + + //2nd time to get row_offsets and column_indices + // + status = nvgraphGetGraphStructure(nvgraph_handle, contracted_graph, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout << "cg row_offsets:\n"; + //std::copy(cgro.begin(), 
cgro.end(), + // std::ostream_iterator(std::cout,"\n")); + + //std::cout << "cg col_indices:\n"; + //std::copy(cgci.begin(), cgci.end(), + // std::ostream_iterator(std::cout,"\n")); + + //PROBLEM: might differ due to different vertex numbering + // + ///ASSERT_EQ(check_diffs(cg_row_offsets, cgro), false); + ///ASSERT_EQ(check_diffs(cg_col_indices, cgci), false); + + //this is one invariant we can check, besides vector sizes: + // + ASSERT_EQ( check_delta_invariant( cg_row_offsets, cgro ), true); + + //std::cout<<"********* step 10: \n"; + + status = nvgraphDestroyGraphDescr(nvgraph_handle, contracted_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, netx_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + catch( const std::exception& ex ) + { + // dump exception: + std::cerr<< "Exception:"< +// #include "boost/tuple/tuple.hpp" +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "nvgraph.h" +#include +#include +#include // private header, contains structures, and potentially other things, used in the public C API that should never be exposed. + +#include "convert_preset_testcases.h" + +#define DEBUG_MSG std::cout << "-----------> " << __FILE__ << " " << __LINE__ << std::endl; +#define DEBUG_VAR(var) std::cout << "-----------> " << __FILE__ << " " << __LINE__ << ": " << #var"=" << var << std::endl; + + +typedef enum +{ + CSR_32 = 0, + CSC_32 = 1, + COO_DEFAULT_32 = 2, + COO_UNSORTED_32 = 3, + COO_SOURCE_32 = 4, + COO_DESTINATION_32 = 5 +} testTopologyType_t; + +// ref functions taken from cuSparse +template +void ref_csr2csc (int m, int n, int nnz, const T_ELEM *csrVals, const int *csrRowptr, const int *csrColInd, T_ELEM *cscVals, int *cscRowind, int *cscColptr, int base=0){ + int i,j, row, col, index; + int * counters; + T_ELEM val; + + /* early return */ + if ((m <= 0) || (n <= 0) || (nnz <= 0)){ + return; + } + + /* build compressed column pointers */ + memset(cscColptr, 0, (n+1)*sizeof(cscColptr[0])); + cscColptr[0]=base; + for (i=0; i& values; + comparator(const std::vector& val_vec): values(val_vec) {} + bool operator()(int n, int m){ + return values[n] < values[m]; + } +}; + +template +void getSortPermutation(const std::vector& minorOrder, const std::vector& majorOrder, std::vector& p){ + int n = majorOrder.size(); + p.clear(); + p.reserve(n); + for(int i=0; i < n; ++i) + p.push_back(i); + + std::stable_sort(p.begin(), p.end(), comparator(minorOrder)); // first "minor" sort + std::stable_sort(p.begin(), p.end(), comparator(majorOrder)); // second "major" sort +} + +template +void ref_cooSortBySource(int n, + const T *srcData, const int *srcRow, const int *srcCol, + T *dstData, int *dstRow, int *dstCol){ + + std::vector srcR(srcRow, srcRow + n); + std::vector srcC(srcCol, srcCol + n); + std::vector p(n, 0); + getSortPermutation(srcC, srcR, p); // sort p according to srcC + + for (int i=0; i +void ref_cooSortByDestination(int nnz, + const T *srcData, const int *srcRow, const int *srcCol, + T *dstData, int *dstRow, int *dstCol){ + ref_cooSortBySource(nnz, srcData, srcCol, srcRow, dstData, dstCol, dstRow); +} +//////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////// +// Random generators +//////////////////////////////////////////////////////////////////////////////////////////////// +void randomArray(int n, void* arr, cudaDataType_t *dataType){ + 
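+    //Random-generator helpers used by the conversion tests below:
+    // - randomArray fills arr with n random values, typed according to *dataType
+    //   (float for CUDA_R_32F; the remaining branch is assumed to handle CUDA_R_64F
+    //   as double).
+    // - randomCOOGenerator/randomCsrGenerator build a random n-vertex pattern with
+    //   at most maxPerRow entries per row, row gaps of at most maxjump and no more
+    //   than max_nnz entries overall; the CSR variant simply routes the COO output
+    //   through ref_coo2csr.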
if(*dataType==CUDA_R_32F){ + float* a = (float*)arr; + for(int i=0; i= n) + break; + rowInd[nnzCounter] = row; + colInd[nnzCounter] = col; + nnzCounter++; + elementsPerRow++; + } + } + *nnz = nnzCounter; +} + +void randomCsrGenerator( int *rowPtr, int *colInd, int *nnz, int n, + int maxPerRow, int maxjump, int max_nnz) { + + int *rowInd = (int*)malloc (sizeof(int)*max_nnz); + randomCOOGenerator(rowInd, colInd, nnz, n, maxPerRow, maxjump, max_nnz); + ref_coo2csr(rowInd, *nnz, n, rowPtr); + free(rowInd); +} + +typedef enum{ + HOST = 0, + DEVICE = 1 +} addressSpace_t; + + +class NVGraphAPIConvertTest : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + + NVGraphAPIConvertTest() : handle(NULL) {} + + // static void SetupTestCase() {} + // static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + srand (time(NULL)); + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + // CPU conversion (reference) + template + static void refConvert(nvgraphTopologyType_t srcTType, void *srcTopology, const T *srcEdgeData, + nvgraphTopologyType_t dstTType, void *dstTopology, T *dstEdgeData){ + + // Trust me, this a 100 times better than nested ifs. + if(srcTType==NVGRAPH_CSR_32 && dstTType==NVGRAPH_CSR_32){ // CSR2CSR + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + memcpy(dstEdgeData, srcEdgeData, sizeof(T)*srcT->nedges); + memcpy(dstT->source_offsets, srcT->source_offsets, sizeof(int)*(srcT->nvertices+1) ); + memcpy(dstT->destination_indices, srcT->destination_indices, sizeof(int)*(srcT->nedges) ); + + } else if(srcTType==NVGRAPH_CSR_32 && dstTType==NVGRAPH_CSC_32) { // CSR2CSC + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + ref_csr2csc (srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, srcT->source_offsets, srcT->destination_indices, + dstEdgeData, dstT->source_indices, dstT->destination_offsets); + + } else if(srcTType==NVGRAPH_CSR_32 && dstTType==NVGRAPH_COO_32) { // CSR2COO + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if(dstT->tag==NVGRAPH_DEFAULT || dstT->tag==NVGRAPH_UNSORTED || dstT->tag==NVGRAPH_SORTED_BY_SOURCE){ + ref_csr2coo(srcT->source_offsets, srcT->nedges, srcT->nvertices, dstT->source_indices); + memcpy(dstT->destination_indices, srcT->destination_indices, sizeof(int)*(srcT->nedges) ); + memcpy(dstEdgeData, srcEdgeData, sizeof(T)*(srcT->nedges) ); + } else if (dstT->tag==NVGRAPH_SORTED_BY_DESTINATION) { + int* tmp=(int*)malloc(sizeof(int)*(dstT->nedges) ); + // Step 1: Convert to COO Source + ref_csr2coo(srcT->source_offsets, srcT->nedges, srcT->nvertices, tmp); + // Step 2: Convert to COO Dest + ref_cooSortByDestination(srcT->nedges, + srcEdgeData, tmp, srcT->destination_indices, + dstEdgeData, dstT->source_indices, dstT->destination_indices); + free(tmp); + } else { + FAIL(); + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else 
if(srcTType==NVGRAPH_CSC_32 && dstTType==NVGRAPH_CSR_32) { // CSC2CSR + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + ref_csr2csc (srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, srcT->destination_offsets, srcT->source_indices, + dstEdgeData, dstT->destination_indices, dstT->source_offsets); + + } else if(srcTType==NVGRAPH_CSC_32 && dstTType==NVGRAPH_CSC_32) { // CSC2CSC + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + memcpy(dstT->destination_offsets, srcT->destination_offsets, sizeof(int)*(srcT->nvertices+1) ); + memcpy(dstT->source_indices, srcT->source_indices, sizeof(int)*(srcT->nedges) ); + memcpy(dstEdgeData, srcEdgeData, sizeof(T)*(srcT->nedges) ); + + } else if(srcTType==NVGRAPH_CSC_32 && dstTType==NVGRAPH_COO_32) { // CSC2COO + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if(dstT->tag==NVGRAPH_SORTED_BY_SOURCE){ + int* tmp = (int*)malloc(sizeof(int)*(dstT->nedges)); + // Step 1: Convert to COO Dest + ref_csr2coo(srcT->destination_offsets, srcT->nedges, srcT->nvertices, tmp); + // Step 2: Convert to COO Source + ref_cooSortBySource(srcT->nedges, + srcEdgeData, srcT->source_indices, tmp, + dstEdgeData, dstT->source_indices, dstT->destination_indices); + free(tmp); + } else if (dstT->tag==NVGRAPH_DEFAULT || dstT->tag==NVGRAPH_UNSORTED || dstT->tag==NVGRAPH_SORTED_BY_DESTINATION) { + ref_csr2coo(srcT->destination_offsets, srcT->nedges, srcT->nvertices, dstT->destination_indices); + memcpy(dstT->source_indices, srcT->source_indices, sizeof(int)*(srcT->nedges) ); + memcpy(dstEdgeData, srcEdgeData, sizeof(T)*(srcT->nedges) ); + } else { + FAIL(); + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else if(srcTType==NVGRAPH_COO_32 && dstTType==NVGRAPH_CSR_32) { // COO2CSR + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if(srcT->tag==NVGRAPH_SORTED_BY_SOURCE){ + ref_coo2csr(srcT->source_indices, srcT->nedges, srcT->nvertices, dstT->source_offsets); + memcpy(dstT->destination_indices, srcT->destination_indices, sizeof(int)*(srcT->nedges) ); + memcpy(dstEdgeData, srcEdgeData, sizeof(T)*(srcT->nedges) ); + + } else if(srcT->tag==NVGRAPH_SORTED_BY_DESTINATION || srcT->tag==NVGRAPH_DEFAULT || srcT->tag==NVGRAPH_UNSORTED){ + int *tmp = (int*)malloc(sizeof(int)*(srcT->nedges) ); + // Step 1: convert to COO Dest + ref_cooSortBySource(srcT->nedges, + srcEdgeData, srcT->source_indices, srcT->destination_indices, + dstEdgeData, tmp, dstT->destination_indices); + // Step 1: convert to CSC + ref_coo2csr(tmp, srcT->nedges, srcT->nvertices, dstT->source_offsets); + free(tmp); + } else { + FAIL(); + } + } else if(srcTType==NVGRAPH_COO_32 && dstTType==NVGRAPH_CSC_32) { // COO2CSC + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if(srcT->tag==NVGRAPH_SORTED_BY_SOURCE || srcT->tag==NVGRAPH_DEFAULT || srcT->tag==NVGRAPH_UNSORTED){ + int *tmp 
= (int*)malloc(sizeof(int)*srcT->nedges); + // Step 1: convert to COO dest + ref_cooSortByDestination(srcT->nedges, + srcEdgeData, srcT->source_indices, srcT->destination_indices, + dstEdgeData, dstT->source_indices, tmp); + // Step 1: convert to CSC + ref_coo2csr(tmp, srcT->nedges, srcT->nvertices, dstT->destination_offsets); + free(tmp); + } else if(srcT->tag==NVGRAPH_SORTED_BY_DESTINATION) { + ref_coo2csr(srcT->destination_indices, srcT->nedges, srcT->nvertices, dstT->destination_offsets); + memcpy(dstT->source_indices, srcT->source_indices, sizeof(int)*(srcT->nedges) ); + memcpy(dstEdgeData, srcEdgeData, sizeof(T)*(srcT->nedges) ); + } else { + FAIL(); + } + } else if(srcTType==NVGRAPH_COO_32 && dstTType==NVGRAPH_COO_32) { // COO2COO + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if(srcT->tag==dstT->tag || dstT->tag==NVGRAPH_DEFAULT || dstT->tag==NVGRAPH_UNSORTED) { + memcpy(dstT->source_indices, srcT->source_indices, sizeof(int)*(srcT->nedges) ); + memcpy(dstT->destination_indices, srcT->destination_indices, sizeof(int)*(srcT->nedges) ); + memcpy(dstEdgeData, srcEdgeData, sizeof(T)*srcT->nedges); + } else if(dstT->tag==NVGRAPH_SORTED_BY_SOURCE) { + ref_cooSortBySource(srcT->nedges, + srcEdgeData, srcT->source_indices, srcT->destination_indices, + dstEdgeData, dstT->source_indices, dstT->destination_indices); + } else if(dstT->tag==NVGRAPH_SORTED_BY_DESTINATION) { + ref_cooSortByDestination(srcT->nedges, + srcEdgeData, srcT->source_indices, srcT->destination_indices, + dstEdgeData, dstT->source_indices, dstT->destination_indices); + } else { + FAIL(); + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else { + FAIL(); + } + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Topology Helper functions + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // The function must be void + static void topoGetN(testTopologyType_t TType, void *topo, int* n){ + int result=0; + if(TType==CSR_32){ + nvgraphCSRTopology32I_t t = static_cast(topo); + result = t->nvertices; + } + else if(TType==CSC_32){ + nvgraphCSCTopology32I_t t = static_cast(topo); + result = t->nvertices; + } + else if(TType==COO_SOURCE_32 || TType==COO_DESTINATION_32 || TType==COO_UNSORTED_32 || TType==COO_DEFAULT_32){ + nvgraphCOOTopology32I_t t = static_cast(topo); + result = t->nvertices; + } + else{ + FAIL(); + } + *n=result; + } + + // The function must be void + static void topoGetNNZ(testTopologyType_t TType, void *topo, int*n){ + int result=0; + if(TType==CSR_32){ + nvgraphCSRTopology32I_t t = static_cast(topo); + result = t->nedges; + } + else if(TType==CSC_32){ + nvgraphCSCTopology32I_t t = static_cast(topo); + result = t->nedges; + } + else if(TType==COO_SOURCE_32 || TType==COO_DESTINATION_32 || TType==COO_UNSORTED_32 || TType==COO_DEFAULT_32){ + nvgraphCOOTopology32I_t t = static_cast(topo); + result = t->nedges; + } + else{ + FAIL(); + } + *n=result; + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Allocation/de-allocation functions + /////////////////////////////////////////////////////////////////////////////////////////////////////// + static void allocateTopo(void **topoPtr, testTopologyType_t TType, int n, int nnz, 
addressSpace_t aSpace){ + if(TType==CSR_32){ + *topoPtr=(nvgraphCSRTopology32I_t)malloc(sizeof(nvgraphCSRTopology32I_st)); + nvgraphCSRTopology32I_t p = static_cast(*topoPtr); + + if(aSpace==HOST){ + p->source_offsets = (int*)malloc(sizeof(int)*(n+1)); + p->destination_indices = (int*)malloc(sizeof(int)*(nnz)); + } else if(aSpace==DEVICE){ + cudaMalloc((void**)&(p->source_offsets), sizeof(int)*(n+1)); + cudaMalloc((void**)&(p->destination_indices), sizeof(int)*(nnz)); + } else { + FAIL(); + } + p->nvertices = n; + p->nedges = nnz; + } + else if(TType==CSC_32){ + *topoPtr=(nvgraphCSCTopology32I_t)malloc(sizeof(nvgraphCSCTopology32I_st)); + nvgraphCSCTopology32I_t p = static_cast(*topoPtr); + + if(aSpace==HOST){ + p->destination_offsets = (int*)malloc(sizeof(int)*(n+1)); + p->source_indices = (int*)malloc(sizeof(int)*(nnz)); + } else if(aSpace==DEVICE){ + cudaMalloc((void**)&(p->destination_offsets), sizeof(int)*(n+1)); + cudaMalloc((void**)&(p->source_indices), sizeof(int)*(nnz)); + } else { + FAIL(); + } + p->nvertices = n; + p->nedges = nnz; + } + else if(TType==COO_SOURCE_32 || TType==COO_DESTINATION_32 || TType==COO_UNSORTED_32 || TType==COO_DEFAULT_32){ + *topoPtr=(nvgraphCOOTopology32I_t)malloc(sizeof(nvgraphCOOTopology32I_st)); + nvgraphCOOTopology32I_t p = static_cast(*topoPtr); + + if(aSpace==HOST){ + p->source_indices = (int*)malloc(sizeof(int)*(nnz)); + p->destination_indices = (int*)malloc(sizeof(int)*(nnz)); + } else if(aSpace==DEVICE){ + cudaMalloc((void**)&(p->source_indices), sizeof(int)*(nnz)); + cudaMalloc((void**)&(p->destination_indices), sizeof(int)*(nnz)); + } else { + FAIL(); + } + p->nvertices = n; + p->nedges = nnz; + + if(TType==COO_SOURCE_32) + p->tag=NVGRAPH_SORTED_BY_SOURCE; + else if(TType==COO_DESTINATION_32) + p->tag=NVGRAPH_SORTED_BY_DESTINATION; + else if(TType==COO_UNSORTED_32) + p->tag=NVGRAPH_UNSORTED; + else if(TType==COO_DEFAULT_32) + p->tag=NVGRAPH_DEFAULT; + else + FAIL(); + } else { + FAIL(); + } + } + + static void deAllocateTopo(void* topo, testTopologyType_t TType, addressSpace_t aSpace){ + if(topo==NULL) + return; + + void *rowPtr, *colPtr; + if(TType==CSR_32){ + nvgraphCSRTopology32I_t p = static_cast(topo); + rowPtr = p->source_offsets; + colPtr = p->destination_indices; + free(p); + } + else if(TType==CSC_32){ + nvgraphCSCTopology32I_t p = static_cast(topo); + rowPtr = p->source_indices; + colPtr = p->destination_offsets; + free(p); + } + else if(TType==COO_SOURCE_32 || TType==COO_DESTINATION_32 || TType==COO_UNSORTED_32 || TType==COO_DEFAULT_32){ + nvgraphCOOTopology32I_t p = static_cast(topo); + rowPtr = p->source_indices; + colPtr = p->destination_indices; + free(p); + } else { + FAIL(); + } + + if(aSpace==HOST){ + free(rowPtr); + free(colPtr); + } else if (aSpace==DEVICE){ + cudaFree(rowPtr); + cudaFree(colPtr); + } else { + FAIL(); + } + } + + static void cpyTopo(void *dst, void *src, testTopologyType_t TType, enum cudaMemcpyKind kind=cudaMemcpyDefault){ + + int *srcRow=NULL, *srcCol=NULL; + int *dstRow=NULL, *dstCol=NULL; + int rowSize=0, colSize=0; + if(TType==CSR_32) { + nvgraphCSRTopology32I_t srcT = static_cast(src); + nvgraphCSRTopology32I_t dstT = static_cast(dst); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + rowSize = srcT->nvertices+1; colSize = srcT->nedges; + srcRow = srcT->source_offsets; dstRow = dstT->source_offsets; + srcCol = srcT->destination_indices; dstCol = dstT->destination_indices; + } else if(TType==CSC_32) { + nvgraphCSCTopology32I_t srcT = static_cast(src); + nvgraphCSCTopology32I_t dstT 
= static_cast(dst); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + rowSize = srcT->nedges; colSize = srcT->nvertices+1; + srcRow = srcT->source_indices; dstRow = dstT->source_indices; + srcCol = srcT->destination_offsets; dstCol = dstT->destination_offsets; + } else if(TType==COO_SOURCE_32 || TType==COO_DESTINATION_32 || TType==COO_UNSORTED_32 || TType==COO_DEFAULT_32) { + nvgraphCOOTopology32I_t srcT = static_cast(src); + nvgraphCOOTopology32I_t dstT = static_cast(dst); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + dstT->tag = srcT->tag; + rowSize = srcT->nedges; colSize = srcT->nedges; + srcRow = srcT->source_indices; dstRow = dstT->source_indices; + srcCol = srcT->destination_indices; dstCol = dstT->destination_indices; + } else { + FAIL(); + } + + ASSERT_EQ(cudaSuccess, cudaMemcpy(dstRow, srcRow, sizeof(int)*rowSize, kind)); + ASSERT_EQ(cudaSuccess, cudaMemcpy(dstCol, srcCol, sizeof(int)*colSize, kind)); + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Comparison functions + /////////////////////////////////////////////////////////////////////////////////////////////////////// + template + static void cmpArray(T* ref, addressSpace_t refSapce, T* dst, addressSpace_t dstSpace, int n){ + + T *_refData=NULL, *_dstData=NULL; if(refSapce==DEVICE){ + _refData = (T*)malloc(sizeof(T)*n); + cudaMemcpy(_refData, ref, sizeof(T)*n, cudaMemcpyDefault); + } else { + _refData = ref; + } + + if(dstSpace==DEVICE){ + _dstData = (T*)malloc(sizeof(T)*n); + cudaMemcpy(_dstData, dst, sizeof(T)*n, cudaMemcpyDefault); + } else { + _dstData = dst; + } + std::vector refData; + std::vector dstData; + refData.assign(_refData, _refData + n); + dstData.assign(_dstData, _dstData + n); + + for(int i=0; i(refTopology); + nvgraphCSRTopology32I_t _dstTopology = static_cast(dstTopology); + ASSERT_EQ( _refTopology->nvertices, _dstTopology->nvertices); + ASSERT_EQ( _refTopology->nedges, _dstTopology->nedges); + _refRows = _refTopology->source_offsets; + _refCols = _refTopology->destination_indices; + _dstRows = _dstTopology->source_offsets; + _dstCols = _dstTopology->destination_indices; + colSize = _refTopology->nedges; + rowSize = _refTopology->nvertices + 1; + } + else if(TType==NVGRAPH_CSC_32){ + nvgraphCSCTopology32I_t _refTopology = static_cast(refTopology); + nvgraphCSCTopology32I_t _dstTopology = static_cast(dstTopology); + ASSERT_EQ( _refTopology->nvertices, _dstTopology->nvertices); + ASSERT_EQ( _refTopology->nedges, _dstTopology->nedges); + _refRows = _refTopology->source_indices; + _refCols = _refTopology->destination_offsets; + _dstRows = _dstTopology->source_indices; + _dstCols = _dstTopology->destination_offsets; + colSize = _refTopology->nvertices + 1; + rowSize = _refTopology->nedges; + } + else if(TType==NVGRAPH_COO_32){ + nvgraphCOOTopology32I_t _refTopology = static_cast(refTopology); + nvgraphCOOTopology32I_t _dstTopology = static_cast(dstTopology); + ASSERT_EQ( _refTopology->nvertices, _dstTopology->nvertices); + ASSERT_EQ( _refTopology->nedges, _dstTopology->nedges); + ASSERT_EQ( _refTopology->tag, _dstTopology->tag); + _refRows = _refTopology->source_indices; + _refCols = _refTopology->destination_indices; + _dstRows = _dstTopology->source_indices; + _dstCols = _dstTopology->destination_indices; + colSize = _refTopology->nedges; + rowSize = _refTopology->nedges; + } + else{ + FAIL(); + } + + if(refSpace==DEVICE){ + refRowsHost = (int*)malloc(sizeof(int)*rowSize); + refColsHost = 
(int*)malloc(sizeof(int)*colSize); + cudaMemcpy(refRowsHost, _refRows, sizeof(int)*rowSize, cudaMemcpyDefault); + cudaMemcpy(refColsHost, _refCols, sizeof(int)*colSize, cudaMemcpyDefault); + } else { + refRowsHost = _refRows; + refColsHost = _refCols; + } + + if(dstSpace==DEVICE){ + dstRowsHost = (int*)malloc(sizeof(int)*rowSize); + dstColsHost = (int*)malloc(sizeof(int)*colSize); + cudaMemcpy(dstRowsHost, _dstRows, sizeof(int)*rowSize, cudaMemcpyDefault); + cudaMemcpy(dstColsHost, _dstCols, sizeof(int)*colSize, cudaMemcpyDefault); + } else { + dstRowsHost = _dstRows; + dstColsHost = _dstCols; + } + std::vector refRows, refCols; + std::vector dstRows, dstCols; + refRows.assign(refRowsHost, refRowsHost + rowSize); + refCols.assign(refColsHost, refColsHost + colSize); + dstRows.assign(dstRowsHost, dstRowsHost + rowSize); + dstCols.assign(dstColsHost, dstColsHost + colSize); + + ASSERT_EQ(refRows, dstRows); + ASSERT_EQ(refCols, dstCols); + if(refSpace==DEVICE) { + free(refRowsHost); + free(refColsHost); + } + if(dstSpace==DEVICE){ + free(dstRowsHost); + free(dstColsHost); + } + } + + static nvgraphTopologyType_t testType2nvGraphType(testTopologyType_t type){ + if(type==CSR_32) + return NVGRAPH_CSR_32; + else if(type==CSC_32) + return NVGRAPH_CSC_32; + else + return NVGRAPH_COO_32; + } + + static nvgraphTag_t testType2tag(testTopologyType_t type){ + + if(type==COO_SOURCE_32) + return NVGRAPH_SORTED_BY_SOURCE; + else if(type==COO_DESTINATION_32) + return NVGRAPH_SORTED_BY_DESTINATION; + else if(type==COO_UNSORTED_32) + return NVGRAPH_UNSORTED; + else + return NVGRAPH_DEFAULT; + } + +}; + +// Compares the convesion result from and to preset values (Used primary for simple test, and to validate reference convsrsion). +class PresetTopology : public NVGraphAPIConvertTest, + public ::testing::WithParamInterface > { // prestTestContainer + public: + // Reference (CPU) conversion check + template + static void refPrestConvertTest(testTopologyType_t srcTestTopoType, void *srcTopology, const double *srcEdgeData, + testTopologyType_t dstTestTopoType, void *refTopology, const double *refEdgeData){ + + int srcN=0, srcNNZ=0; + int refN=0, refNNZ=0; + topoGetN(srcTestTopoType, srcTopology, &srcN); + topoGetNNZ(srcTestTopoType, srcTopology, &srcNNZ); + topoGetN(dstTestTopoType, refTopology, &refN); + topoGetNNZ(dstTestTopoType, refTopology, &refNNZ); + + // Allocate result Topology + T *dstEdgeDataT = (T*)malloc(sizeof(T)*refNNZ); + void *dstTopology=NULL; + allocateTopo(&dstTopology, dstTestTopoType, refN, refNNZ, HOST); + ////////////////////////////////////////////////// + + // Convert host edge data to template type + T *srcEdgeDataT = (T*)malloc(sizeof(T)*srcNNZ); + T *refEdgeDataT = (T*)malloc(sizeof(T)*refNNZ); + const double *pT=(const double*)srcEdgeData; + for(int i=0; i + void nvgraphPresetConvertTest(testTopologyType_t srcTestTopoType, void *srcTopologyHst, const double *srcEdgeDataHst, cudaDataType_t *dataType, + testTopologyType_t dstTestTopoType, void *refTopologyHst, const double *refEdgeDataHst){ + + int srcN=0, srcNNZ=0; + int refN=0, refNNZ=0; + topoGetN(srcTestTopoType, srcTopologyHst, &srcN); + topoGetNNZ(srcTestTopoType, srcTopologyHst, &srcNNZ); + topoGetN(dstTestTopoType, refTopologyHst, &refN); + topoGetNNZ(dstTestTopoType, refTopologyHst, &refNNZ); + + // Allocate topoplogies in device memory + void *srcTopologyDv=NULL, *dstTopologyDv=NULL; + allocateTopo(&srcTopologyDv, srcTestTopoType, refN, refNNZ, DEVICE); + allocateTopo(&dstTopologyDv, dstTestTopoType, refN, refNNZ, DEVICE); + 
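+        //The conversion exercised here (see TEST_P(PresetTopology, nvgraphConvertTopology)
+        //below) is driven with device-resident topologies, so the host-side preset is
+        //staged into the freshly allocated device buffers before the call is made.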
cpyTopo(srcTopologyDv, srcTopologyHst, srcTestTopoType, cudaMemcpyHostToDevice); // Copy src topology to device + ////////////////////////////////////////////////// + + // Convert host edge data to template type + T *srcEdgeDataHstT = (T*)malloc(sizeof(T)*srcNNZ); + T *refEdgeDataHstT = (T*)malloc(sizeof(T)*refNNZ); + const double *pT=(const double*)srcEdgeDataHst; + for(int i=0; i(GetParam()); + testTopologyType_t srcTestTopoType = std::tr1::get<1>(GetParam()); + testTopologyType_t dstTestTopoType = std::tr1::get<2>(GetParam()); + presetTestContainer_st prestTestContainer = std::tr1::get<3>(GetParam()); + + if(dstTestTopoType==COO_UNSORTED_32) + return; + + void *srcTopology=NULL, *refTopology=NULL; + const void *srcEdgeData=NULL, *refEdgeData=NULL; + this->getTestData(srcTestTopoType, &srcTopology, &srcEdgeData, prestTestContainer); + this->getTestData(dstTestTopoType, &refTopology, &refEdgeData, prestTestContainer); + + if(dataType==CUDA_R_32F) { + this->refPrestConvertTest(srcTestTopoType, srcTopology, (const double*)srcEdgeData, + dstTestTopoType, refTopology, (const double*)refEdgeData); + } else if (dataType==CUDA_R_64F) { + this->refPrestConvertTest(srcTestTopoType, srcTopology, (const double*)srcEdgeData, + dstTestTopoType, refTopology, (const double*)refEdgeData); + } else { + FAIL(); + } +} + + +TEST_P(PresetTopology, nvgraphConvertTopology) { + + cudaDataType_t dataType = std::tr1::get<0>(GetParam()); + testTopologyType_t srcTestTopoType = std::tr1::get<1>(GetParam()); + testTopologyType_t dstTestTopoType = std::tr1::get<2>(GetParam()); + presetTestContainer_st prestTestContainer = std::tr1::get<3>(GetParam()); + + if(dstTestTopoType==COO_UNSORTED_32) + return; + + void *srcTopology=NULL, *refTopology=NULL; + const void *srcEdgeData=NULL, *refEdgeData=NULL; + this->getTestData(srcTestTopoType, &srcTopology, &srcEdgeData, prestTestContainer); + this->getTestData(dstTestTopoType, &refTopology, &refEdgeData, prestTestContainer); + + if(dataType==CUDA_R_32F){ + this->nvgraphPresetConvertTest( srcTestTopoType, srcTopology, (const double*)srcEdgeData, &dataType, + dstTestTopoType, refTopology, (const double*)refEdgeData); + } else if (dataType==CUDA_R_64F) { + this->nvgraphPresetConvertTest( srcTestTopoType, srcTopology, (const double*)srcEdgeData, &dataType, + dstTestTopoType, refTopology, (const double*)refEdgeData); + } else { + FAIL(); + } +} + + + +class RandomTopology : public NVGraphAPIConvertTest, + public ::testing::WithParamInterface > { // nnz + public: + virtual void SetUp() { + NVGraphAPIConvertTest::SetUp(); + } + // nvgraph conversion check + template + void nvgraphTopologyConvertTest(testTopologyType_t srcTestTopoType, void *srcTopologyHst, const double *srcEdgeDataHst, + cudaDataType_t *dataType, testTopologyType_t dstTestTopoType){ + int srcN=0, srcNNZ=0; + topoGetN(srcTestTopoType, srcTopologyHst, &srcN); + topoGetNNZ(srcTestTopoType, srcTopologyHst, &srcNNZ); + + // Allocate result space in host memory + T *refResultEdgeDataT=(T*)malloc(sizeof(T)*srcNNZ); + void *refResultTopologyHst=NULL; + allocateTopo(&refResultTopologyHst, dstTestTopoType, srcN, srcNNZ, HOST); + ////////////////////////////////////////////////// + + // Allocate topologies space in device memory + void *srcTopologyDv=NULL, *resultTopologyDv=NULL; + T *resultEdgeData=NULL; + ASSERT_EQ(cudaSuccess, cudaMalloc( (void**)&resultEdgeData, sizeof(T)*srcNNZ) ); + allocateTopo(&srcTopologyDv, srcTestTopoType, srcN, srcNNZ, DEVICE); + allocateTopo(&resultTopologyDv, dstTestTopoType, srcN, srcNNZ, 
DEVICE); + cpyTopo(srcTopologyDv, srcTopologyHst, srcTestTopoType, cudaMemcpyHostToDevice); // Copy src topology to device + ////////////////////////////////////////////////// + + // Convert host edge data to template type + T *srcEdgeDataHstT = (T*)malloc(sizeof(T)*srcNNZ); + const double *pT=(const double*)srcEdgeDataHst; + for(int i=0; i + void nvgraphGraphConvertTest(testTopologyType_t srcTestTopoType, void *srcTopologyHst, const double *srcEdgeDataHst, + cudaDataType_t *dataType, testTopologyType_t dstTestTopoType){ + int srcN=0, srcNNZ=0; + topoGetN(srcTestTopoType, srcTopologyHst, &srcN); + topoGetNNZ(srcTestTopoType, srcTopologyHst, &srcNNZ); + + // Allocate result space in host memory + T *refResultEdgeDataT=(T*)malloc(sizeof(T)*srcNNZ); + void *refResultTopologyHst=NULL; + allocateTopo(&refResultTopologyHst, dstTestTopoType, srcN, srcNNZ, HOST); + ////////////////////////////////////////////////// + + // Allocate topologies space in device memory + void *srcTopologyDv=NULL, *resultTopologyDv=NULL; + T *resultEdgeData=NULL; + ASSERT_EQ(cudaSuccess, cudaMalloc( (void**)&resultEdgeData, sizeof(T)*srcNNZ) ); + allocateTopo(&srcTopologyDv, srcTestTopoType, srcN, srcNNZ, DEVICE); + allocateTopo(&resultTopologyDv, dstTestTopoType, srcN, srcNNZ, DEVICE); + cpyTopo(srcTopologyDv, srcTopologyHst, srcTestTopoType, cudaMemcpyHostToDevice); // Copy src topology to device + ////////////////////////////////////////////////// + + // Convert host edge data to template type + T *srcEdgeDataHstT = (T*)malloc(sizeof(T)*srcNNZ); + const double *pT=(const double*)srcEdgeDataHst; + for(int i=0; i(GetParam()); + testTopologyType_t srcTestTopoType = std::tr1::get<1>(GetParam()); + testTopologyType_t dstTestTopoType = std::tr1::get<2>(GetParam()); + int n = std::tr1::get<3>(GetParam()); + int max_nnz = std::tr1::get<4>(GetParam()); + int maxJump = (rand() % n)+1; + int maxPerRow = (rand() % max_nnz)+1; + int nnz; + + void *srcTopology; + allocateTopo(&srcTopology, srcTestTopoType, n, max_nnz, HOST); + if(srcTestTopoType==CSR_32) { + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + randomCsrGenerator( srcT->source_offsets, srcT->destination_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else if(srcTestTopoType==CSC_32) { + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + randomCsrGenerator( srcT->destination_offsets, srcT->source_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else if(srcTestTopoType==COO_SOURCE_32) { + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + randomCOOGenerator( srcT->source_indices, srcT->destination_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else if(srcTestTopoType==COO_DESTINATION_32 || srcTestTopoType==COO_UNSORTED_32 || srcTestTopoType==COO_DEFAULT_32) { + // Unsorted and default to have COO_dest sorting. 
(sorted is a special case of unsorted array) + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + randomCOOGenerator( srcT->destination_indices, srcT->source_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else { + FAIL(); + } + + double *srcEdgeData = (double*)malloc(sizeof(double)*nnz); + for(int i=0; invgraphTopologyConvertTest (srcTestTopoType, srcTopology, srcEdgeData, &dataType, dstTestTopoType); + } else if (dataType==CUDA_R_64F) { + this->nvgraphTopologyConvertTest (srcTestTopoType, srcTopology, srcEdgeData, &dataType, dstTestTopoType); + } else { + FAIL(); + } + deAllocateTopo(srcTopology, srcTestTopoType, HOST); + free(srcEdgeData); +} + + +class RandomGraph : public NVGraphAPIConvertTest, + public ::testing::WithParamInterface > { // nnz + public: + nvgraphGraphDescr_t srcGrDesc, dstGrDesc, refGrDesc; + void *srcEdgeData, *dstEdgeData, *refEdgeData; + void *srcVertexData, *dstVertexData, *refVertexData; + void *srcTopology, *refTopology; + nvgraphTopologyType_t srcTopoType, dstTopoType; + testTopologyType_t srcTestTopoType, dstTestTopoType; + virtual void SetUp() { + NVGraphAPIConvertTest::SetUp(); + status = nvgraphCreateGraphDescr(handle, &srcGrDesc); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphCreateGraphDescr(handle, &dstGrDesc); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphCreateGraphDescr(handle, &refGrDesc); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + srcEdgeData = NULL; + dstEdgeData = NULL; + refEdgeData = NULL; + srcVertexData = NULL; + dstVertexData = NULL; + refVertexData = NULL; + + srcTopology = NULL; + refTopology = NULL; + } + virtual void TearDown() { + if(srcGrDesc!=NULL){ + status = nvgraphDestroyGraphDescr(handle, srcGrDesc); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + if(dstGrDesc!=NULL){ + status = nvgraphDestroyGraphDescr(handle, dstGrDesc); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + if(refGrDesc!=NULL){ + status = nvgraphDestroyGraphDescr(handle, refGrDesc); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + free(srcEdgeData); + free(dstEdgeData); + free(refEdgeData); + free(srcVertexData); + free(dstVertexData); + free(refVertexData); + deAllocateTopo(srcTopology, srcTestTopoType, HOST); + deAllocateTopo(refTopology, dstTestTopoType, HOST); + NVGraphAPIConvertTest::TearDown(); + } +}; + +TEST_P(RandomGraph, nvgraphConvertGraph) { + + cudaDataType_t dataType = std::tr1::get<0>(GetParam()); + srcTestTopoType = std::tr1::get<1>(GetParam()); + dstTestTopoType = std::tr1::get<2>(GetParam()); + int n = std::tr1::get<3>(GetParam()); + int max_nnz = std::tr1::get<4>(GetParam()); + int maxJump = (rand() % n)+1; + int maxPerRow = (rand() % max_nnz)+1; + int nnz; + + nvgraphTopologyType_t srcTopoType, dstTopoType; + srcTopoType = testType2nvGraphType(srcTestTopoType); + dstTopoType = testType2nvGraphType(dstTestTopoType); + + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Prepare input graph + /////////////////////////////////////////////////////////////////////////////////////////////////////// + allocateTopo(&srcTopology, srcTestTopoType, n, max_nnz, HOST); + if(srcTestTopoType==CSR_32) { + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + randomCsrGenerator( srcT->source_offsets, srcT->destination_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else if(srcTestTopoType==CSC_32) { + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + randomCsrGenerator( 
srcT->destination_offsets, srcT->source_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else if(srcTestTopoType==COO_SOURCE_32) { + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + randomCOOGenerator( srcT->source_indices, srcT->destination_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else if(srcTestTopoType==COO_DESTINATION_32 || srcTestTopoType==COO_UNSORTED_32 || srcTestTopoType==COO_DEFAULT_32) { + // Unsorted and default to have COO_dest sorting. (sorted is a special case of unsorted array) + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + randomCOOGenerator( srcT->destination_indices, srcT->source_indices, &nnz, n, + maxPerRow, maxJump, max_nnz); + srcT->nedges = nnz; + } else { + FAIL(); + } + + status = nvgraphSetGraphStructure(handle, srcGrDesc, srcTopology, srcTopoType); + if(srcTopoType==NVGRAPH_CSR_32 || srcTopoType==NVGRAPH_CSC_32){ + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } else if (srcTopoType==NVGRAPH_COO_32){ // COO graph is not supported + ASSERT_EQ(NVGRAPH_STATUS_TYPE_NOT_SUPPORTED, status); + return; + } else { + FAIL(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Prepeate data arrays + /////////////////////////////////////////////////////////////////////////////////////////////////////// + if(dataType==CUDA_R_32F){ + srcEdgeData = malloc(sizeof(float)*nnz); + dstEdgeData = malloc(sizeof(float)*nnz); + refEdgeData = malloc(sizeof(float)*nnz); + srcVertexData = malloc(sizeof(float)*n); + dstVertexData = malloc(sizeof(float)*n); + refVertexData = malloc(sizeof(float)*n); + } + else if (dataType==CUDA_R_64F){ + srcEdgeData = malloc(sizeof(double)*nnz); + dstEdgeData = malloc(sizeof(double)*nnz); + refEdgeData = malloc(sizeof(double)*nnz); + srcVertexData = malloc(sizeof(double)*n); + dstVertexData = malloc(sizeof(double)*n); + refVertexData = malloc(sizeof(double)*n); + } else + FAIL(); + + if(srcEdgeData==NULL || dstEdgeData==NULL || refEdgeData==NULL) + FAIL(); + if(srcVertexData==NULL || dstVertexData==NULL || refVertexData==NULL) + FAIL(); + /////////////////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////////////////////////////////////////////////////////////////////////////////// + // Prepare reference graph + /////////////////////////////////////////////////////////////////////////////////////////////////////// + allocateTopo(&refTopology, dstTestTopoType, n, nnz, HOST); + if(dataType==CUDA_R_32F) + refConvert( srcTopoType, srcTopology, (float*)srcEdgeData, + dstTopoType, refTopology, (float*)refEdgeData ); // We don't care about edgeData + else if (dataType==CUDA_R_64F) + refConvert( srcTopoType, srcTopology, (double*)srcEdgeData, + dstTopoType, refTopology, (double*)refEdgeData ); // We don't care about edgeData + else + FAIL(); + status = nvgraphSetGraphStructure(handle, refGrDesc, refTopology, dstTopoType); + if( dstTopoType==NVGRAPH_CSR_32 || dstTopoType==NVGRAPH_CSC_32){ + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } else if (dstTopoType==NVGRAPH_COO_32) { // We don't support COO graphs + ASSERT_EQ(NVGRAPH_STATUS_TYPE_NOT_SUPPORTED, status); + return; + } else { + FAIL(); + } + /////////////////////////////////////////////////////////////////////////////////////////////////////// + + 
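+    // The next section attaches a random number of vertex and edge data sets (up to 10 of
+    // each) to both the source and the reference descriptor. A dimension of zero is expected
+    // to be rejected with NVGRAPH_STATUS_INVALID_VALUE, while any positive dimension must
+    // succeed so the converted graph can later be compared against the reference set by set.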
/////////////////////////////////////////////////////////////////////////////////////////////////////// + // Fill graph with vertex and edge data + /////////////////////////////////////////////////////////////////////////////////////////////////////// + size_t edgeDataDim = (rand() % 11); // up to 10 edgeData sets + std::vector edgeDataType(edgeDataDim); + std::fill (edgeDataType.begin(), edgeDataType.end(), dataType); + status = nvgraphAllocateEdgeData( handle, srcGrDesc, edgeDataDim, edgeDataType.data()); + if(edgeDataDim==0) + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + else + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData( handle, refGrDesc, edgeDataDim, edgeDataType.data()); + if(edgeDataDim==0) + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + else + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + for(size_t i=0; i vertexDataType(vertexDataDim); + std::fill (vertexDataType.begin(), vertexDataType.end(), dataType); + status = nvgraphAllocateVertexData( handle, srcGrDesc, vertexDataDim, vertexDataType.data()); + if(vertexDataDim==0) + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + else + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData( handle, refGrDesc, vertexDataDim, vertexDataType.data()); + if(vertexDataDim==0) + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + else + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + for(size_t i=0; i *refMCSRG = static_cast*> (refGrDesc->graph_handle); + ref_nvertices = static_cast(refMCSRG->get_num_vertices()); + ref_nedges = static_cast(refMCSRG->get_num_edges()); + refOffset = refMCSRG->get_raw_row_offsets(); + refInd = refMCSRG->get_raw_column_indices(); + + nvgraph::MultiValuedCsrGraph *dstMCSRG = static_cast*> (dstGrDesc->graph_handle); + dst_nvertices = static_cast(dstMCSRG->get_num_vertices()); + dst_nedges = static_cast(dstMCSRG->get_num_edges()); + dstOffset = dstMCSRG->get_raw_row_offsets(); + dstInd = dstMCSRG->get_raw_column_indices(); + } else if (dataType==CUDA_R_64F) { + nvgraph::MultiValuedCsrGraph *refMCSRG = static_cast*> (refGrDesc->graph_handle); + ref_nvertices = static_cast(refMCSRG->get_num_vertices()); + ref_nedges = static_cast(refMCSRG->get_num_edges()); + refOffset = refMCSRG->get_raw_row_offsets(); + refInd = refMCSRG->get_raw_column_indices(); + + nvgraph::MultiValuedCsrGraph *dstMCSRG = static_cast*> (dstGrDesc->graph_handle); + dst_nvertices = static_cast(dstMCSRG->get_num_vertices()); + dst_nedges = static_cast(dstMCSRG->get_num_edges()); + dstOffset = dstMCSRG->get_raw_row_offsets(); + dstInd = dstMCSRG->get_raw_column_indices(); + } else + FAIL(); + + ASSERT_EQ(ref_nvertices, dst_nvertices); + ASSERT_EQ(ref_nedges, dst_nedges); + cmpArray(refOffset, DEVICE, dstOffset, DEVICE, n+1); + cmpArray(refInd, DEVICE, dstInd, DEVICE, nnz); + + for(size_t i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "valued_csr_graph.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" + +static std::string ref_data_prefix = ""; +static std::string graph_data_prefix = ""; + +std::string convert_to_local_path(const std::string& in_file) + { + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (graph_data_prefix.length() > 0) + { + prefix = graph_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\matrices_collection\\"; + prefix = "Z:\\matrices_collection\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); 
+#else + prefix = "/mnt/nvgraph_test_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + +//annonymus: +namespace { + + class file_read_error + { + public: + file_read_error(const std::string& msg) : + msg_(msg) + { + msg_ = std::string("File read error: ") + msg; + } + ~file_read_error() { + } + + const std::string& what() const { + return (msg_); + } + private: + std::string msg_; + }; + + template + void fill_extraction_data(const std::string& fname, + Vector& g_row_offsets, + Vector& g_col_indices, + Vector& aggregates, + Vector& cg_row_offsets, + Vector& cg_col_indices) + { + typedef typename Vector::value_type T; + std::ifstream m_stream(fname.c_str(), std::ifstream::in); + std::string line; + + if (!m_stream.is_open()) + { + throw file_read_error(fname); + } + + bool keep_going = !std::getline(m_stream, line).eof(); + + //debug: + //std::cout<> value; + g_row_offsets.push_back(value); + } + + //ignore next 2 lines: + // + if (!std::getline(m_stream, line) || !std::getline(m_stream, line)) + return; + + g_col_indices.reserve(g_nnz); + + //read G col_indices: + for (int i = 0; (i < g_nnz) && keep_going; ++i) + { + T value(0); + + keep_going = !std::getline(m_stream, line).eof(); + std::stringstream ss(line); + ss >> value; + g_col_indices.push_back(value); + } + + //ignore next line: + // + if (!std::getline(m_stream, line)) + return; + + //remove the following for extraction: + //{ + if (!std::getline(m_stream, line)) + return; + int n_aggs = 0; + std::sscanf(line.c_str(), "aggregate: size=%d", &n_aggs); + + //assert( n_aggs == g_nrows );//not true for subgraph extraction! + + aggregates.reserve(n_aggs); + + //read aggregate: + for (int i = 0; (i < n_aggs) && keep_going; ++i) + { + T value(0); + + keep_going = !std::getline(m_stream, line).eof(); + std::stringstream ss(line); + ss >> value; + aggregates.push_back(value); + } + //} end remove code for extraction + + if (!keep_going || !std::getline(m_stream, line)) + return; + int cg_nrows = 0; + int cg_nnz = 0; + std::sscanf(line.c_str(), "result %c: nrows=%d, nnz=%d", &c, &cg_nrows, &cg_nnz); + + //debug: + //std::cout<> value; + cg_row_offsets.push_back(value); + } + + //ignore next 2 lines: + // + if (!std::getline(m_stream, line) || !std::getline(m_stream, line)) + return; + + cg_col_indices.reserve(cg_nnz); + + //read G col_indices: + for (int i = 0; (i < cg_nnz) && keep_going; ++i) + { + T value(0); + + keep_going = !std::getline(m_stream, line).eof(); + std::stringstream ss(line); + ss >> value; + cg_col_indices.push_back(value); + } + + m_stream.close(); //not really needed...destructor handles this + } + + template + bool check_diffs(const Vector& v1, const Vector& v2) + { + typedef typename Vector::value_type T; + + Vector v(v1.size(), 0); + std::transform(v1.begin(), v1.end(), + v2.begin(), + v.begin(), + std::minus()); + + if (std::find_if(v.begin(), v.end(), std::bind2nd(std::not_equal_to(), 0)) != v.end()) + return true; + else + return false; + } + +//check if sort(delta(r1)) == sort(delta(r2)) +//where delta(r)={r[i+1]-r[i] | i <- [0..|r|-1]} +// + template + bool check_delta_invariant(const Vector& r1, const Vector& r2) + { + typedef typename Vector::value_type T; + + size_t sz = r1.size(); + assert(sz == r2.size()); + + Vector d1(sz - 1); + + std::transform(r1.begin() + 1, r1.end(), + r1.begin(), + d1.begin(), + std::minus()); + + Vector d2(sz - 1); + + std::transform(r2.begin() + 1, r2.end(), + r2.begin(), + d2.begin(), + std::minus()); + + std::sort(d1.begin(), d1.end()); + std::sort(d2.begin(), 
d2.end()); + + return (d1 == d2); + } +} + +class NvgraphCAPITests_SubgraphCSR: public ::testing::Test { +public: + NvgraphCAPITests_SubgraphCSR() : + nvgraph_handle(NULL), initial_graph(NULL) { + } + +protected: + static void SetupTestCase() + { + } + static void TearDownTestCase() + { + } + virtual void SetUp() + { + if (nvgraph_handle == NULL) { + status = nvgraphCreate(&nvgraph_handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + // set up graph + status = nvgraphCreateGraphDescr(nvgraph_handle, &initial_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st topoData; + topoData.nvertices = 5; + topoData.nedges = 9; + int neighborhood[] = { 0, 2, 3, 5, 7, 9 }; + int edgedest[] = { 1, 3, 3, 1, 4, 0, 2, 2, 4 }; + topoData.source_offsets = neighborhood; + topoData.destination_indices = edgedest; + status = nvgraphSetGraphStructure( nvgraph_handle, + initial_graph, + (void*) &topoData, + NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph data + size_t numsets = 2; + float vertexvals0[] = { 0.1f, 0.15893e-20f, 1e27f, 13.2f, 0.f }; + float vertexvals1[] = { 13., 322.64, 1e28, -1.4, 22.3 }; + void* vertexptr[] = { (void*) vertexvals0, (void*) vertexvals1 }; + cudaDataType_t type_v[] = { CUDA_R_32F, CUDA_R_32F }; + float edgevals0[] = { 0.1f, 0.9153e-20f, 0.42e27f, 185.23, 1e21f, 15.6f, 215.907f, 912.2f, + 0.2f }; + float edgevals1[] = { 13., 322.64, 1e28, 197534.2, 0.1, 0.425e-5, 5923.4, 0.12e-12, 52. }; + void* edgeptr[] = { (void*) edgevals0, (void*) edgevals1 }; + cudaDataType_t type_e[] = { CUDA_R_32F, CUDA_R_32F }; + + status = nvgraphAllocateVertexData(nvgraph_handle, initial_graph, numsets, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(nvgraph_handle, initial_graph, (void *) vertexptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(nvgraph_handle, initial_graph, (void *) vertexptr[1], 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateEdgeData(nvgraph_handle, initial_graph, numsets, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(nvgraph_handle, initial_graph, (void *) edgeptr[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(nvgraph_handle, initial_graph, (void *) edgeptr[1], 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //save data - those will be available in the tests directly + graph_neigh.assign(neighborhood, neighborhood + topoData.nvertices + 1); + graph_edged.assign(edgedest, edgedest + topoData.nedges); + graph_vvals0.assign(vertexvals0, vertexvals0 + topoData.nvertices); + graph_vvals1.assign(vertexvals1, vertexvals1 + topoData.nvertices); + graph_evals0.assign(edgevals0, edgevals0 + topoData.nedges); + graph_evals1.assign(edgevals1, edgevals1 + topoData.nedges); + } + virtual void TearDown() + { + // destroy graph + status = nvgraphDestroyGraphDescr(nvgraph_handle, initial_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + // release library + if (nvgraph_handle != NULL) { + status = nvgraphDestroy(nvgraph_handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + nvgraph_handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t nvgraph_handle; + nvgraphGraphDescr_t initial_graph; + + std::vector graph_neigh; + std::vector graph_edged; + std::vector graph_vvals0; + std::vector graph_vvals1; + std::vector graph_evals0; + std::vector graph_evals1; +}; + +TEST_F(NvgraphCAPITests_SubgraphCSR, CSRSubgraphVertices_Sanity) +{ + 
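+    // Worked expectation for the first extraction below, based on the fixture graph
+    // (CSR offsets {0,2,3,5,7,9}, destinations {1,3,3,1,4,0,2,2,4}): the subgraph induced
+    // by vertices {2,4} keeps edges 2->4, 4->2 and 4->4 (original edge slots 4, 7 and 8),
+    // i.e. 2 vertices and 3 edges. Only the relative numbering of the two extracted
+    // vertices is unspecified, which is why the assertions branch on which one became #0.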
nvgraphStatus_t status; + nvgraphGraphDescr_t temp_graph1 = NULL, temp_graph2 = NULL; + + float getVvals0[4]; + float getVvals1[4]; + float getEvals0[4]; + float getEvals1[4]; + + { + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int vertices[] = { 2, 4 }; + status = nvgraphExtractSubgraphByVertex( nvgraph_handle, + initial_graph, + temp_graph2, + vertices, + 2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + int tData_source_offsets[3], tData_destination_indices[3]; + tData.source_offsets = tData_source_offsets; + tData.destination_indices = tData_destination_indices; + nvgraphTopologyType_t TT; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph2, (void*) &tData, &TT); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(TT, NVGRAPH_CSR_32); + ASSERT_EQ(tData.nvertices, 2); + ASSERT_EQ(tData.nedges, 3); + + status = nvgraphGetVertexData(nvgraph_handle, temp_graph2, (void *) getVvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetVertexData(nvgraph_handle, temp_graph2, (void *) getVvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph2, (void *) getEvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph2, (void *) getEvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // we are extracting two vertices, but we are not sure which of them will be #0 and which will be #1 + // we are comparing vertex values to determine that and handle both cases + if (getVvals0[0] == graph_vvals0[vertices[0]]) + //vertex #0 in new graph - vertex #2 in old graph + //vertex #1 in new graph - vertex #4 in old graph + { + // check that vertex values are extracted correctly + ASSERT_EQ(getVvals0[0], graph_vvals0[vertices[0]]); + ASSERT_EQ(getVvals1[0], graph_vvals1[vertices[0]]); + ASSERT_EQ(getVvals0[1], graph_vvals0[vertices[1]]); + ASSERT_EQ(getVvals1[1], graph_vvals1[vertices[1]]); + // check that edge values are extracted correctly + ASSERT_EQ(getEvals0[0], graph_evals0[4]); + ASSERT_EQ(getEvals0[1], graph_evals0[7]); + ASSERT_EQ(getEvals0[2], graph_evals0[8]); + ASSERT_EQ(getEvals1[0], graph_evals1[4]); + ASSERT_EQ(getEvals1[1], graph_evals1[7]); + ASSERT_EQ(getEvals1[2], graph_evals1[8]); + // Check structure + ASSERT_EQ(tData.source_offsets[0], 0); + ASSERT_EQ(tData.source_offsets[1], 1); + ASSERT_EQ(tData.source_offsets[2], 3); + ASSERT_EQ(tData.destination_indices[0], 1); + ASSERT_EQ(tData.destination_indices[1], 0); + ASSERT_EQ(tData.destination_indices[2], 1); + } + + //vertex #0 in new graph - vertex #4 in old graph + //vertex #1 in new graph - vertex #2 in old graph + else + { + // check that vertex values are extracted correctly + ASSERT_EQ(getVvals0[0], graph_vvals0[vertices[1]]); + ASSERT_EQ(getVvals0[1], graph_vvals0[vertices[0]]); + ASSERT_EQ(getVvals1[0], graph_vvals1[vertices[1]]); + ASSERT_EQ(getVvals1[1], graph_vvals1[vertices[0]]); + // check that edge values are extracted correctly + ASSERT_EQ(getEvals0[0], graph_evals0[7]); + ASSERT_EQ(getEvals0[1], graph_evals0[8]); + ASSERT_EQ(getEvals0[2], graph_evals0[4]); + ASSERT_EQ(getEvals1[0], graph_evals1[7]); + ASSERT_EQ(getEvals1[1], graph_evals1[8]); + ASSERT_EQ(getEvals1[2], graph_evals1[4]); + // check structure + ASSERT_EQ(tData.source_offsets[0], 0); + ASSERT_EQ(tData.source_offsets[1], 2); + ASSERT_EQ(tData.source_offsets[2], 3); + ASSERT_EQ(tData.destination_indices[0], 
0); + ASSERT_EQ(tData.destination_indices[1], 1); + ASSERT_EQ(tData.destination_indices[2], 0); + } + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + //@TODO: how to check extracting by multiple vertices? do we preserve order of vertices/edges? + //@TODO: this would make sense only if vertices order is perserved in the extracted subgraph + int vertices[4] = { 0, 1, 3, 4 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByVertex(nvgraph_handle, initial_graph, temp_graph1, vertices, 3); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + /*size_t nverts1 = 0, nedges1 = 0; + int neighborget[5]; + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(nverts1, 4); + status = nvgraphGetGraphNedges(nvgraph_handle, temp_graph1, &nedges1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(nedges1, 4); + + // check structure: + status = nvgraphGetGraphNeighborhood(nvgraph_handle, temp_graph1, neighborget); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(neighborget[0], 0); + ASSERT_EQ(neighborget[1], 2); + ASSERT_EQ(neighborget[2], 3); + ASSERT_EQ(neighborget[3], 4); + ASSERT_EQ(neighborget[4], 4); + + int edgeget[4]; + status = nvgraphGetGraphEdgeDest( nvgraph_handle, temp_graph1, edgeget); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(edgeget[0], 1); + ASSERT_EQ(edgeget[1], 3); + ASSERT_EQ(edgeget[2], 3); + ASSERT_EQ(edgeget[3], 0); + + // check values + status = nvgraphGetVertexData(nvgraph_handle, temp_graph1, (void *)getVvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getVvals0[0], vertexvals0[vertices[0]]); + ASSERT_EQ(getVvals0[1], vertexvals0[vertices[1]]); + ASSERT_EQ(getVvals0[2], vertexvals0[vertices[2]]); + ASSERT_EQ(getVvals0[3], vertexvals0[vertices[3]]); + status = nvgraphGetVertexData(nvgraph_handle, temp_graph1, (void *)getVvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getVvals1[0], vertexvals1[vertices[0]]); + ASSERT_EQ(getVvals1[1], vertexvals1[vertices[1]]); + ASSERT_EQ(getVvals1[2], vertexvals1[vertices[2]]); + ASSERT_EQ(getVvals1[3], vertexvals1[vertices[3]]); + + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph1, (void *)getEvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getEvals0[0], edgevals0[0]); + ASSERT_EQ(getEvals0[1], edgevals0[1]); + ASSERT_EQ(getEvals0[2], edgevals0[2]); + ASSERT_EQ(getEvals0[3], edgevals0[6]); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph1, (void *)getEvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getEvals1[0], edgevals1[0]); + ASSERT_EQ(getEvals1[1], edgevals1[1]); + ASSERT_EQ(getEvals1[2], edgevals1[2]); + ASSERT_EQ(getEvals1[3], edgevals1[6]);*/ + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +TEST_F(NvgraphCAPITests_SubgraphCSR, CSRSubgraphVertices_CornerCases) +{ + nvgraphStatus_t status; + + nvgraphGraphDescr_t temp_graph1 = NULL, temp_graph2 = NULL; + float getVvals0[4]; + float getVvals1[4]; + float getEvals0[4]; + float getEvals1[4]; +// failures + { + int vertices[2] = { 1, 3 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // bad library nvgraph_handle + status = nvgraphExtractSubgraphByEdge(NULL, initial_graph, temp_graph1, vertices, 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // bad descriptor 1 + 
status = nvgraphExtractSubgraphByEdge(nvgraph_handle, temp_graph2, temp_graph1, vertices, 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // bad descriptor 2 + status = nvgraphExtractSubgraphByEdge( nvgraph_handle, + initial_graph, + temp_graph2, + vertices, + 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // NULL pointer + status = nvgraphExtractSubgraphByEdge( nvgraph_handle, + initial_graph, + temp_graph1, + (int*) NULL, + 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // extract zero vertices - failure expected + status = nvgraphExtractSubgraphByVertex( nvgraph_handle, + initial_graph, + temp_graph1, + vertices, + 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // extracting vertices more than in original graph - failure expected + int too_many_vertices[] = { 0, 1, 2, 3, 4, 5, 10, 15 }; + status = nvgraphExtractSubgraphByVertex( nvgraph_handle, + initial_graph, + temp_graph1, + too_many_vertices, + 8); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // unexisting indices - failure expected + int bad_vertices[] = { -1, 2, 15 }; + status = nvgraphExtractSubgraphByVertex( nvgraph_handle, + initial_graph, + temp_graph1, + bad_vertices, + 3); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // Not connected vertices + { + int vertices[] = { 0, 2 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByVertex( nvgraph_handle, + initial_graph, + temp_graph1, + vertices, + 2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + nvgraphTopologyType_t TT; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph1, (void*) &tData, &TT); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(TT, NVGRAPH_CSR_32); + ASSERT_EQ(tData.nvertices, 2); + ASSERT_EQ(tData.nedges, 0); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // extract vertex that has edge to itself + { + int vertices[] = { 4 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByVertex( nvgraph_handle, + initial_graph, + temp_graph1, + vertices, + 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph1, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(tData.nvertices, 1); + ASSERT_EQ(tData.nedges, 1); + + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph1, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetVertexData(nvgraph_handle, temp_graph1, (void *) getVvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetVertexData(nvgraph_handle, temp_graph1, (void *) getVvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph1, (void *) getEvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph1, (void *) getEvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getVvals0[0], graph_vvals0[vertices[0]]); + 
ASSERT_EQ(getVvals1[0], graph_vvals1[vertices[0]]); + ASSERT_EQ(getEvals0[0], graph_evals0[graph_evals0.size() - 1]); + ASSERT_EQ(getEvals1[0], graph_evals1[graph_evals0.size() - 1]); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // extract whole graph + { + int vertices[] = { 0, 1, 2, 3, 4 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByVertex( nvgraph_handle, + initial_graph, + temp_graph1, + vertices, + 5); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph1, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(tData.nvertices, (int )graph_vvals0.size()); + ASSERT_EQ(tData.nedges, (int )graph_evals0.size()); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +} + +TEST_F(NvgraphCAPITests_SubgraphCSR, CSRSubgraphEdges_Sanity) +{ + nvgraphStatus_t status; + + nvgraphGraphDescr_t temp_graph1 = NULL, temp_graph2 = NULL; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + float getVvals0[4]; + float getVvals1[4]; + float getEvals0[4]; + float getEvals1[4]; + + // for all edges: try to extract graph using only 1 edge + { + for (int r = 0; r < (int) graph_vvals0.size() /* == nvertices */; r++) + { + for (int e = graph_neigh[r]; e < graph_neigh[r + 1]; e++) + { + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByEdge( nvgraph_handle, + initial_graph, + temp_graph2, + &e, + 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph2, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetVertexData(nvgraph_handle, temp_graph2, (void *) getVvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetVertexData(nvgraph_handle, temp_graph2, (void *) getVvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph2, (void *) getEvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph2, (void *) getEvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // check structure - should always be 1 edge and 2 vertices, special case for the last edge, because it is from vertex #5 to itself + if (e != (int) graph_evals0.size() - 1) + { + // check structure + ASSERT_EQ(tData.nvertices, 2)<< "Row : " << r << ", Edge : " << e; + ASSERT_EQ(tData.nedges, 1) << "Row : " << r << ", Edge : " << e; + // check vertex data + ASSERT_TRUE((getVvals0[0] == graph_vvals0[r]) || (getVvals0[0] == graph_vvals0[graph_edged[e]])) << getVvals0[0] << " " << graph_vvals0[r] << " " << graph_vvals0[graph_edged[e]]; + ASSERT_TRUE((getVvals0[1] == graph_vvals0[r]) || (getVvals0[1] == graph_vvals0[graph_edged[e]])) << getVvals0[1] << " " << graph_vvals0[r] << " " << graph_vvals0[graph_edged[e]]; + ASSERT_TRUE(getVvals0[0] != getVvals0[1]) << getVvals0[0] << " " << getVvals0[1]; + ASSERT_TRUE((getVvals1[0] == graph_vvals1[r]) || (getVvals1[0] == 
graph_vvals1[graph_edged[e]])) << getVvals1[0] << " " << graph_vvals1[r] << " " << graph_vvals1[graph_edged[e]]; + ASSERT_TRUE((getVvals1[1] == graph_vvals1[r]) || (getVvals1[1] == graph_vvals1[graph_edged[e]])) << getVvals1[1] << " " << graph_vvals1[r] << " " << graph_vvals1[graph_edged[e]]; + ASSERT_TRUE(getVvals1[0] != getVvals1[1]) << getVvals1[0] << " " << getVvals1[1]; + } + else // special case for the last edge - from last vertex to itself + { + // check structure + ASSERT_EQ(tData.nvertices, 1) << "Row : " << r << ", Edge : " << e; + ASSERT_EQ(tData.nedges, 1) << "Row : " << r << ", Edge : " << e; + // check vertex data + ASSERT_TRUE(getVvals0[0] == graph_vvals0[r]) << getVvals0[0] << " " << graph_vvals0[r]; + ASSERT_TRUE(getVvals1[0] == graph_vvals1[r]) << getVvals1[0] << " " << graph_vvals1[r]; + } + // check edge data + ASSERT_EQ(getEvals0[0], graph_evals0[e])<< getEvals0[0] << " " << graph_evals0[e]; + ASSERT_EQ(getEvals1[0], graph_evals1[e])<< getEvals1[0] << " " << graph_evals1[e]; + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + } + + //@TODO: we need somehow check extraction by multiple edges + //@TODO: this would make sense only if vertices order is perserved in the extracted subgraph + int edges[2] = { 1, 3 }; + status = nvgraphExtractSubgraphByEdge(nvgraph_handle, initial_graph, temp_graph1, edges, 2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + /*size_t nverts1 = 0, nedges1 = 0; + status = nvgraphGetGraphNvertices(nvgraph_handle, temp_graph1, &nverts1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(nverts1, 3); + status = nvgraphGetGraphNedges(nvgraph_handle, temp_graph1, &nedges1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(nedges1, 2); + + // check structure: + int neighborget[4]; + status = nvgraphGetGraphNeighborhood(nvgraph_handle, temp_graph1, neighborget); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(neighborget[0], 0); + ASSERT_EQ(neighborget[1], 1); + ASSERT_EQ(neighborget[2], 2); + ASSERT_EQ(neighborget[3], 2); + int edgeget[2]; + status = nvgraphGetGraphEdgeDest( nvgraph_handle, temp_graph1, edgeget); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(edgeget[0], 2); + ASSERT_EQ(edgeget[1], 0); + + status = nvgraphGetVertexData(nvgraph_handle, temp_graph1, (void *)getVvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getVvals0[0], vertexvals0[0]); + ASSERT_EQ(getVvals0[1], vertexvals0[2]); + ASSERT_EQ(getVvals0[2], vertexvals0[3]); + status = nvgraphGetVertexData(nvgraph_handle, temp_graph1, (void *)getVvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getVvals1[0], vertexvals1[0]); + ASSERT_EQ(getVvals1[1], vertexvals1[2]); + ASSERT_EQ(getVvals1[2], vertexvals1[3]); + + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph1, (void *)getEvals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getEvals0[0], edgevals0[edges[0]]); + ASSERT_EQ(getEvals0[1], edgevals0[edges[1]]); + status = nvgraphGetEdgeData(nvgraph_handle, temp_graph1, (void *)getEvals1, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(getEvals1[0], edgevals1[edges[0]]); + ASSERT_EQ(getEvals1[1], edgevals1[edges[1]]);*/ + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +TEST_F(NvgraphCAPITests_SubgraphCSR, CSRSubgraphEdges_CornerCases) +{ + nvgraphStatus_t status; + + nvgraphGraphDescr_t temp_graph1 = NULL, temp_graph2 = NULL; + +// expected failures 
+ { + int edges[2] = { 1, 3 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + // bad library nvgraph_handle + status = nvgraphExtractSubgraphByEdge(NULL, initial_graph, temp_graph2, edges, 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + // bad descriptor 1 + status = nvgraphExtractSubgraphByEdge(nvgraph_handle, temp_graph2, temp_graph1, edges, 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + // bad descriptor 2 + status = nvgraphExtractSubgraphByEdge(nvgraph_handle, initial_graph, temp_graph2, edges, 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + // NULL pointer + status = nvgraphExtractSubgraphByEdge( nvgraph_handle, + initial_graph, + temp_graph1, + (int*) NULL, + 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // extract zero edges - failure expected + status = nvgraphExtractSubgraphByEdge(nvgraph_handle, initial_graph, temp_graph1, edges, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // bad edge number - in the C API we ask array consist of existing col_indices + int bad_edge[1] = { -10 }; + status = nvgraphExtractSubgraphByEdge( nvgraph_handle, + initial_graph, + temp_graph1, + bad_edge, + 1); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + // more edges than exists in the graph - in the C API we ask array consist of existing col_indices + int too_many_edges[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + status = nvgraphExtractSubgraphByEdge( nvgraph_handle, + initial_graph, + temp_graph1, + too_many_edges, + 10); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // not connected edges, which should create not connected graph + { + int edges[2] = { 0, 8 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByEdge(nvgraph_handle, initial_graph, temp_graph1, edges, 2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph1, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + // we extracting 2 edges: one between two vertices and another is from third vertex to itself + ASSERT_EQ(tData.nvertices, 3); + ASSERT_EQ(tData.nedges, 2); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // triangle. 
+ { + int edges[2] = { 0, 2 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByEdge(nvgraph_handle, initial_graph, temp_graph1, edges, 2); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph1, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + // we extracting 2 edges, expecting new graph have 3 vertices and only 2 edges + ASSERT_EQ(tData.nvertices, 3); + ASSERT_EQ(tData.nedges, 2); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + // extract by edge to the self + { + int edges[1] = { 8 }; + status = nvgraphCreateGraphDescr(nvgraph_handle, &temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphExtractSubgraphByEdge(nvgraph_handle, initial_graph, temp_graph1, edges, 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + status = nvgraphGetGraphStructure(nvgraph_handle, temp_graph1, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + // we extracting 1 edge to the vertex itself, expecting new graph have only 1 vertex and 1 edge + ASSERT_EQ(tData.nvertices, 1); + ASSERT_EQ(tData.nedges, 1); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, temp_graph1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +} + +TEST_F(NvgraphCAPITests_SubgraphCSR, CSRContractionNetworkX) +{ + nvgraphStatus_t status; + + try { + nvgraphGraphDescr_t netx_graph = NULL; + nvgraphGraphDescr_t extracted_graph = NULL; + + status = nvgraphCreateGraphDescr(nvgraph_handle, &netx_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphCreateGraphDescr(nvgraph_handle, &extracted_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + std::string fname(convert_to_local_path("graphs/networkx/extr_test.dat")); + + std::vector g_row_offsets; + std::vector g_col_indices; + + std::vector aggregates; + std::vector cg_row_offsets; + std::vector cg_col_indices; + + fill_extraction_data(fname, + g_row_offsets, + g_col_indices, + aggregates, + cg_row_offsets, + cg_col_indices); + + //std::cout<<"********* step 1: \n"; + + ASSERT_EQ(g_row_offsets.empty(), false); + ASSERT_EQ(g_col_indices.empty(), false); + ASSERT_EQ(aggregates.empty(), false); + ASSERT_EQ(cg_row_offsets.empty(), false); + ASSERT_EQ(cg_col_indices.empty(), false); + + //std::cout<<"********* step 1.1: \n"; + + ASSERT_EQ(g_col_indices.size(), g_row_offsets.back()); + ASSERT_EQ(cg_col_indices.size(), cg_row_offsets.back()); + + //std::cout<<"********* step 1.2: \n"; + + nvgraphCSRTopology32I_st topoData; + topoData.nvertices = g_row_offsets.size() - 1; //last is nnz + topoData.nedges = g_col_indices.size(); + + //std::cout<<"(n,m):"< vdata(topoData.nvertices, 1.); + void* vptr[] = { (void*) &vdata[0] }; + cudaDataType_t type_v[] = { CUDA_R_32F }; + + std::vector edata(topoData.nedges, 1.); + void* eptr[] = { (void*) &edata[0] }; + cudaDataType_t type_e[] = { CUDA_R_32F }; + + status = nvgraphAllocateVertexData(nvgraph_handle, + netx_graph, + numsets, + type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 3: \n"; + + status = nvgraphSetVertexData(nvgraph_handle, + netx_graph, + (void *) vptr[0], + 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, 
status); + + //std::cout<<"********* step 4: \n"; + + status = nvgraphAllocateEdgeData(nvgraph_handle, + netx_graph, + numsets, + type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 5: \n"; + + status = nvgraphSetEdgeData(nvgraph_handle, + netx_graph, + (void *) eptr[0], + 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 6: \n"; + + status = nvgraphExtractSubgraphByVertex(nvgraph_handle, + netx_graph, + extracted_graph, + &aggregates[0], + aggregates.size()); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 7: \n"; + + nvgraphCSRTopology32I_st tData; + tData.source_offsets = NULL; + tData.destination_indices = NULL; + + //1st time to get nvertices and nedges + // + status = nvgraphGetGraphStructure(nvgraph_handle, extracted_graph, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout<<"********* step 8: \n"; + + int cgnv = cg_row_offsets.size() - 1; + int cgne = cg_col_indices.size(); + ASSERT_EQ(tData.nvertices, cgnv); + ASSERT_EQ(tData.nedges, cgne); + + //std::cout<<"********* step 9: \n"; + + std::vector cgro(cgnv + 1, 0); + std::vector cgci(cgne, 0); + + tData.source_offsets = &cgro[0]; + tData.destination_indices = &cgci[0]; + + //2nd time to get row_offsets and column_indices + // + status = nvgraphGetGraphStructure(nvgraph_handle, extracted_graph, (void*) &tData, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //std::cout << "cg row_offsets:\n"; + //std::copy(cgro.begin(), cgro.end(), + // std::ostream_iterator(std::cout,"\n")); + + //std::cout << "cg col_indices:\n"; + //std::copy(cgci.begin(), cgci.end(), + // std::ostream_iterator(std::cout,"\n")); + + //PROBLEM: might differ due to different vertex numbering + // + ///ASSERT_EQ(check_diffs(cg_row_offsets, cgro), false); + ///ASSERT_EQ(check_diffs(cg_col_indices, cgci), false); + + //this is one invariant we can check, besides vector sizes: + // + ASSERT_EQ(check_delta_invariant(cg_row_offsets, cgro), true); + + //std::cout<<"********* step 10: \n"; + + status = nvgraphDestroyGraphDescr(nvgraph_handle, extracted_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphDestroyGraphDescr(nvgraph_handle, netx_graph); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + catch (const file_read_error& ex) + { + std::cout << "Exception: " << ex.what() << ", waiving the test\n"; + const ::testing::TestInfo* const test_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() + << std::endl; + return; + } + catch (const std::exception& ex) + { + // dump exception: + ASSERT_TRUE(false)<< "Exception: " << ex.what(); + } + catch(...) + { + ASSERT_TRUE(false) << "Exception: Unknown"; + } +} + +int main(int argc, char **argv) + { + ::testing::InitGoogleTest(&argc, argv); + for (int i = 0; i < argc; i++) + { + if (strcmp(argv[i], "--ref-data-dir") == 0) + ref_data_prefix = std::string(argv[i + 1]); + if (strcmp(argv[i], "--graph-data-dir") == 0) + graph_data_prefix = std::string(argv[i + 1]); + } + return RUN_ALL_TESTS(); +} diff --git a/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_traversal.cpp b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_traversal.cpp new file mode 100644 index 00000000000..8309ad8d841 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_traversal.cpp @@ -0,0 +1,725 @@ +// This is gtest application that contains all of the C API tests. 
Parameters: +// nvgraph_capi_tests [--perf] [--stress-iters N] [--gtest_filter=NameFilterPatter] +// It also accepts any other gtest (1.7.0) default parameters. +// Right now this application contains: +// 1) Sanity Check tests - tests on simple examples with known answer (or known behaviour) +// 2) Correctness checks tests - tests on real graph data, uses reference algorithm +// (CPU code for SrSPMV and python scripts for other algorithms, see +// python scripts here: //sw/gpgpu/nvgraph/test/ref/) with reference results, compares those two. +// It also measures performance of single algorithm C API call, enf enabled (see below) +// 3) Corner cases tests - tests with some bad inputs, bad parameters, expects library to handle +// it gracefully +// 4) Stress tests - makes sure that library result is persistent throughout the library usage +// (a lot of C API calls). Also makes some assumptions and checks on memory usage during +// this test. +// +// We can control what tests to launch by using gtest filters. For example: +// Only sanity tests: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Sanity* +// And, correspondingly: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Correctness* +// ./nvgraph_capi_tests_traversal --gtest_filter=*Corner* +// ./nvgraph_capi_tests_traversal --gtest_filter=*Stress* +// Or, combination: +// ./nvgraph_capi_tests_traversal --gtest_filter=*Sanity*:*Correctness* +// +// Performance reports are provided in the ERIS format and disabled by default. +// Could be enabled by adding '--perf' to the command line. I added this parameter to vlct +// +// Parameter '--stress-iters N', which gives multiplier (not an absolute value) for the number of launches for stress tests +// + +#include + +#include "gtest/gtest.h" + +#include "nvgraph_test_common.h" + +#include "valued_csr_graph.hxx" +#include "readMatrix.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include // experimental header, contains hidden API entries, can be shared only under special circumstances without reveling internal things + +#include "stdlib.h" +#include +#include +#include + +// do the perf measurements, enabled by command line parameter '--perf' +static int PERF = 0; + +// minimum vertices in the graph to perform perf measurements +#define PERF_ROWS_LIMIT 10000 + +// number of repeats = multiplier/num_vertices +#define Traversal_ITER_MULTIPLIER 30000000 + +template +struct nvgraph_Const; + +template <> +struct nvgraph_Const +{ + static const cudaDataType_t Type = CUDA_R_32I; + static const int inf; +}; +const int nvgraph_Const::inf = INT_MAX; + +static std::string ref_data_prefix = ""; +static std::string graph_data_prefix = ""; + +// iterations for stress tests = this multiplier * iterations for perf tests +static int STRESS_MULTIPLIER = 10; + +bool enough_device_memory(int n, int nnz, size_t add) +{ + size_t mtotal, mfree; + cudaMemGetInfo(&mfree, &mtotal); + if (mfree > add + sizeof(int)*(4*n)) //graph + pred + distances + 2n (working data) + return true; + return false; +} + +std::string convert_to_local_path(const std::string& in_file) +{ + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (graph_data_prefix.length() > 0) + { + prefix = graph_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\matrices_collection\\"; + prefix = "Z:\\matrices_collection\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); +#else + prefix = "/mnt/nvgraph_test_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + 
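
The `nvgraph_Const` trait defined above maps an element type to the matching `cudaDataType_t` plus an "infinity" sentinel (`INT_MAX` for `int`), and the tests read it whenever they allocate vertex or edge data or check for unreachable vertices. The sketch below restates that pattern under a different name, `nvgraph_Const_sketch`, and adds a `float` specialization purely to illustrate how the pattern extends; the `float` variant and the header choices are assumptions of this sketch, not part of the diff.

```cpp
// Illustrative restatement of the type-trait pattern used by these tests.
#include <climits>
#include <cfloat>
#include <cstdio>
#include <library_types.h>   // cudaDataType_t, CUDA_R_32I, CUDA_R_32F (CUDA toolkit header)

template <typename T>
struct nvgraph_Const_sketch;

template <>
struct nvgraph_Const_sketch<int> {
    static const cudaDataType_t Type = CUDA_R_32I;
    static const int inf = INT_MAX;        // distance value for unreachable vertices
};

template <>
struct nvgraph_Const_sketch<float> {       // hypothetical extension, for illustration only
    static const cudaDataType_t Type = CUDA_R_32F;
    static constexpr float inf = FLT_MAX;
};

int main() {
    // Consumed the same way the tests do when allocating vertex data:
    cudaDataType_t type_v[1] = { nvgraph_Const_sketch<int>::Type };
    int sentinel = nvgraph_Const_sketch<int>::inf;
    std::printf("int sentinel: %d, type enum: %d\n", sentinel, static_cast<int>(type_v[0]));
    return 0;
}
```

Keeping the CUDA type enum and the unreachable-distance sentinel in a single trait is what lets the templated `run_current_test` below stay type-agnostic apart from its template parameter.
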
+ + + + +void ref_bfs(int n, int nnz, int *rowPtr, int *colInd, int *mask, int source_vertex, int *distances) { + for(int i=0; i!=n; ++i) + distances[i] = INT_MAX; + + std::queue q; + q.push(source_vertex); + distances[source_vertex] = 0; + + while(!q.empty()) { + int u = q.front(); + q.pop(); + + for(int iCol = rowPtr[u]; iCol != rowPtr[u+1]; ++iCol) { + if(mask && !mask[iCol]) continue; + int v = colInd[iCol]; + if(distances[v] == INT_MAX) { //undiscovered + distances[v] = distances[u] + 1; + q.push(v); + } + } + + } +} + +typedef struct Traversal_Usecase_t +{ + std::string graph_file; + int source_vert; + bool useMask; + bool undirected; + + Traversal_Usecase_t(const std::string& a, int b, bool _useMask=false, bool _undirected=false) : source_vert(b), useMask(_useMask), undirected(_undirected) { + graph_file = convert_to_local_path(a); + }; + + Traversal_Usecase_t& operator=(const Traversal_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + source_vert = rhs.source_vert; + useMask = rhs.useMask; + return *this; + } +} Traversal_Usecase; + + +//// Traversal tests + +class NVGraphCAPITests_Traversal : public ::testing::TestWithParam { + public: + NVGraphCAPITests_Traversal() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const Traversal_Usecase& param) + { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + ss << param.source_vert; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file) + std::string("_") + ss.str().c_str(); + + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a network in amgx binary format + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector csr_read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, csr_read_val), 0); + fclose(fpin); + + std::vector csr_mask(nnz, 1); + + if(param.useMask) { + //Generating a mask + //Should be improved + for(int i=0; i < nnz; i += 2) + csr_mask[i] = 0; + } + + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + // set up graph data + size_t numsets_v = 2, numsets_e = param.useMask ? 
1 : 0; + std::vector calculated_distances_res(n); + std::vector calculated_predecessors_res(n); + //void* vertexptr[1] = {(void*)&calculated_res[0]}; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + cudaDataType_t type_e[1] = {nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, numsets_v, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + if(param.useMask) { + status = nvgraphAllocateEdgeData(handle, g1, numsets_e, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + int source_vert = param.source_vert; + nvgraphTraversalParameter_t traversal_param; + nvgraphTraversalParameterInit(&traversal_param); + nvgraphTraversalSetDistancesIndex(&traversal_param, 0); + nvgraphTraversalSetPredecessorsIndex(&traversal_param, 1); + nvgraphTraversalSetUndirectedFlag(&traversal_param, param.undirected); + + if(param.useMask) { + //if we need to use a mask + //Copying mask into graph + + status = nvgraphSetEdgeData(handle, g1, &csr_mask[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + nvgraphTraversalSetEdgeMaskIndex(&traversal_param, 0); + } + + status = nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDeviceSynchronize(); + + if (PERF && n > PERF_ROWS_LIMIT) + { + double start, stop; + start = second(); + int repeat = 30; + for (int i = 0; i < repeat; i++) + { + status = nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", test_id.c_str(), 1000.0*(stop-start)/repeat); + } + + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // get result + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_distances_res[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_predecessors_res[0], 1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // check with reference + std::vector expected_distances_res(n); + ref_bfs(n, nnz, &read_row_ptr[0], &read_col_ind[0], &csr_mask[0], source_vert, &expected_distances_res[0]); + //Checking distances + for (int i = 0; i < n; ++i) + { + ASSERT_EQ(expected_distances_res[i], calculated_distances_res[i]) << "Wrong distance from source in row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n" ; + } + + //Checking predecessors + for (int i = 0; i < n; ++i) { + if(calculated_predecessors_res[i] != -1) { + ASSERT_EQ(expected_distances_res[i], expected_distances_res[calculated_predecessors_res[i]] + 1) << "Wrong predecessor in row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n" ; + } else { + ASSERT_TRUE(expected_distances_res[i] == 0 || expected_distances_res[i] == INT_MAX) << "Wrong predecessor in row #" << i << " graph " << param.graph_file << " source_vert=" << source_vert<< "\n" ; + + } + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(NVGraphCAPITests_Traversal, CheckResult) +{ + run_current_test(GetParam()); +} + +/// Few sanity checks. 
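
In the test above, `csr_mask` zeroes every second nonzero and is handed both to `nvgraphTraversal` (through `nvgraphTraversalSetEdgeMaskIndex`) and to the reference `ref_bfs`, whose `if (mask && !mask[iCol]) continue;` line skips masked-out edges. The key point is that the mask is indexed by CSR edge position, not by vertex. The self-contained sketch below condenses the same reference logic onto a 4-vertex toy graph; the graph and mask values are made up for illustration.

```cpp
// Standalone sketch of the masked reference BFS: a 0 in the mask removes the edge
// stored at that CSR position.
#include <climits>
#include <cstdio>
#include <queue>
#include <vector>

static void masked_bfs(int n, const int* rowPtr, const int* colInd,
                       const int* mask, int src, std::vector<int>& dist) {
    dist.assign(n, INT_MAX);
    std::queue<int> q;
    q.push(src);
    dist[src] = 0;
    while (!q.empty()) {
        int u = q.front(); q.pop();
        for (int e = rowPtr[u]; e != rowPtr[u + 1]; ++e) {
            if (mask && !mask[e]) continue;          // masked-out edge slot: treat as absent
            int v = colInd[e];
            if (dist[v] == INT_MAX) { dist[v] = dist[u] + 1; q.push(v); }
        }
    }
}

int main() {
    // Chain 0 -> 1 -> 2 -> 3 plus a shortcut 0 -> 3 stored at edge index 1.
    const int rowPtr[] = { 0, 2, 3, 4, 4 };
    const int colInd[] = { 1, 3, 2, 3 };
    const int mask[]   = { 1, 0, 1, 1 };             // drop the 0 -> 3 shortcut
    std::vector<int> dist;
    masked_bfs(4, rowPtr, colInd, mask, 0, dist);
    for (int v = 0; v < 4; ++v)
        std::printf("dist[%d] = %d\n", v, dist[v]);  // prints 0, 1, 2, 3
    return 0;
}
```

Dropping edge 1 changes `dist[3]` from 1 to 3, which is exactly the behaviour the masked correctness check relies on: the GPU traversal and the CPU reference must ignore the same set of edge slots.
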
+ +class NVGraphCAPITests_Traversal_Sanity : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_Traversal_Sanity() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSR_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void prepare_and_run(const nvgraphCSRTopology32I_st& topo_st, int* expected ) + { + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + n = topo_st.nvertices; + nnz = topo_st.nedges; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topo_st, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int source_vert = 0; + int traversal_distances_index = 0; + + nvgraphTraversalParameter_t traversal_param; + nvgraphTraversalParameterInit(&traversal_param); + nvgraphTraversalSetDistancesIndex(&traversal_param, traversal_distances_index); + + status = nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // get result + std::vector calculated_res(n); + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res[0], traversal_distances_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + int reference_res = (int)expected[row]; + int nvgraph_res = (int)calculated_res[row]; + ASSERT_EQ(reference_res, nvgraph_res); + } + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + +// cycle graph, shortest path = vertex number + template + void run_cycle_test() + { + n = 1024; + nnz = n; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i] = i; + neighborhood[i] = (i + 1) % n; + } + offsets[n] = n; + std::vector expected_res(n, nvgraph_Const::inf); + for (int i = 0; i < n; i++) + { + expected_res[i] = i; + } + + nvgraphCSRTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + prepare_and_run(topology, &expected_res[0]); + } + +}; + +TEST_F(NVGraphCAPITests_Traversal_Sanity, SanityCycle) +{ + run_cycle_test(); +} + +class NVGraphCAPITests_Traversal_CornerCases : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + int n; + int nnz; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_Traversal_CornerCases() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + topo = NVGRAPH_CSR_32; + nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + template + void run_cycle_test() + { + n = 1024; + nnz = n; + std::vector offsets(n+1), neighborhood(n); + for (int i = 0; i < n; i++) + { + offsets[i] = i; + 
neighborhood[i] = (i + 1) % n; + } + offsets[n] = n; + + nvgraphCSRTopology32I_st topology = {n, nnz, &offsets[0], &neighborhood[0]}; + + int source_vert = 0; + int traversal_distances_index = 0; + + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // only multivaluedCSR are supported + nvgraphTraversalParameter_t traversal_param; + nvgraphTraversalParameterInit(&traversal_param); + nvgraphTraversalSetDistancesIndex(&traversal_param, traversal_distances_index); + + status = nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + + cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphTraversal(NULL, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphTraversal(handle, NULL, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, NULL, traversal_param); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // only CSR is supported + { + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, NVGRAPH_CSC_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphAllocateVertexData(handle, g1, 1, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphTraversalParameterInit(&traversal_param); + nvgraphTraversalSetDistancesIndex(&traversal_param, traversal_distances_index); + + status = nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + } +}; + +TEST_F(NVGraphCAPITests_Traversal_CornerCases, CornerCases) +{ + run_cycle_test(); +} + +class NVGraphCAPITests_Traversal_Stress : public ::testing::TestWithParam { + public: + NVGraphCAPITests_Traversal_Stress() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + //const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + //printf("We are in test %s of test case %s.\n", test_info->name(), test_info->test_case_name()); + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + template + void run_current_test(const Traversal_Usecase& param) + { + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a network in amgx binary format and the bookmark of dangling nodes + ASSERT_EQ(read_header_amgx_csr_bin (fpin, n, nnz), 0); + 
std::vector read_row_ptr(n+1), read_col_ind(nnz); + std::vector read_val(nnz); + ASSERT_EQ(read_data_amgx_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind, read_val), 0); + fclose(fpin); + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + std::vector calculated_res(n); + // set up graph data + //size_t numsets = 1; + //cudaDataType_t type_v[1] = {nvgraph_Const::Type}; + size_t numsets = 2; + cudaDataType_t type_v[2] = {nvgraph_Const::Type, nvgraph_Const::Type}; + + status = nvgraphAllocateVertexData(handle, g1, numsets, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + int source_vert = param.source_vert; + int traversal_distances_index = 0; + int traversal_predecessors_index = 1; + + // run + int repeat = 2;//std::max((int)(((float)(Traversal_ITER_MULTIPLIER)*STRESS_MULTIPLIER)/(3*n)), 1); + + std::vector calculated_res1(n), calculated_res_mid1(n), calculated_res_last(n); + std::vector calculated_res2(n), calculated_res_mid2(n); + size_t free_mid = 0, free_last = 0, total = 0; + for (int i = 0; i < repeat; i++) + { + nvgraphTraversalParameter_t traversal_param; + nvgraphTraversalParameterInit(&traversal_param); + nvgraphTraversalSetPredecessorsIndex(&traversal_param, 1); + nvgraphTraversalSetUndirectedFlag(&traversal_param, param.undirected); + nvgraphTraversalSetDistancesIndex(&traversal_param, traversal_distances_index); + status = nvgraphTraversal(handle, g1, NVGRAPH_TRAVERSAL_BFS, &source_vert, traversal_param); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // all of those should be equal + if (i == 0) + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res1[0], traversal_distances_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res2[0], traversal_predecessors_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + else + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_mid1[0], traversal_distances_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_mid2[0], traversal_predecessors_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + for (int row = 0; row < n; row++) + { + ASSERT_EQ(calculated_res1[row], calculated_res_mid1[row]) << "Difference in result in distances for row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + // predecessors could be different since multiple shortest paths are possible + //ASSERT_EQ(calculated_res2[row], calculated_res_mid2[row]) << "Difference in result in predecessors for row #" << row << " graph " << param.graph_file << " for iterations #0 and iteration #" << i; + } + } + + if (i == std::min(50, (int)(repeat/2))) + { + cudaMemGetInfo(&free_mid, &total); + } + if (i == repeat-1) + { + status = nvgraphGetVertexData(handle, g1, (void *)&calculated_res_last[0], traversal_distances_index); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + cudaMemGetInfo(&free_last, &total); + } + } + + ASSERT_LE(free_mid, free_last) << "Memory difference between iteration #" << std::min(50, (int)(repeat/2)) << " and last iteration is " << (double)(free_last-free_mid)/1e+6 << "MB"; + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + 
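
The stress fixture above samples free device memory with `cudaMemGetInfo` at iteration `min(50, repeat/2)` and again on the last iteration, then requires `ASSERT_LE(free_mid, free_last)`: free memory at the end must be no smaller than it was mid-run. This is a coarse heuristic for "repeated `nvgraphTraversal` calls do not leak device memory" rather than an exact accounting. Below is a minimal sketch of the same pattern; `run_workload_once` and the iteration count are placeholders standing in for the traversal call.

```cpp
// Minimal sketch of the leak heuristic used by the stress test: sample free device
// memory mid-run and at the end; a net loss between the two samples suggests a leak.
#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

static void run_workload_once() { /* placeholder: allocate, compute, free on the GPU */ }

int main() {
    const int repeat = 100;
    size_t free_mid = 0, free_last = 0, total = 0;
    for (int i = 0; i < repeat; ++i) {
        run_workload_once();
        if (i == repeat / 2)  cudaMemGetInfo(&free_mid,  &total);   // mid-run snapshot
        if (i == repeat - 1)  cudaMemGetInfo(&free_last, &total);   // final snapshot
    }
    // Same direction as ASSERT_LE(free_mid, free_last) above.
    assert(free_mid <= free_last);
    std::printf("free mid-run: %zu bytes, free at end: %zu bytes\n", free_mid, free_last);
    return 0;
}
```
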
+TEST_P(NVGraphCAPITests_Traversal_Stress, Stress) +{ + run_current_test(GetParam()); +} + +// instatiation of the performance/correctness checks + + +INSTANTIATE_TEST_CASE_P(CorrectnessCheck, + NVGraphCAPITests_Traversal, + // graph FILE source vert # file with expected result (in binary?) + ::testing::Values( + Traversal_Usecase("graphs/cage/cage13_T.mtx.bin", 0) + , Traversal_Usecase("graphs/cage/cage13_T.mtx.bin", 10) + , Traversal_Usecase("graphs/cage/cage14_T.mtx.bin", 0) + , Traversal_Usecase("graphs/cage/cage14_T.mtx.bin", 10) + , Traversal_Usecase("graphs/small/small.bin", 0) + , Traversal_Usecase("graphs/small/small.bin", 0) + , Traversal_Usecase("graphs/small/small.bin", 3) + , Traversal_Usecase("graphs/dblp/dblp.bin", 0, false, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 100, false, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 1000, false, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 100000, false, true) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 0) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 100) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 10000) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 100000) + , Traversal_Usecase("graphs/Wikipedia/2011/wiki2011.bin", 1) + , Traversal_Usecase("graphs/Wikipedia/2011/wiki2011.bin", 1000) + //, Traversal_Usecase("graphs/citPatents/cit-Patents_T.mtx.bin", 6543, "") + //, Traversal_Usecase("dimacs10/kron_g500-logn20_T.mtx.bin", 100000, "") + //, Traversal_Usecase("dimacs10/hugetrace-00020_T.mtx.bin", 100000, "") + //, Traversal_Usecase("dimacs10/delaunay_n24_T.mtx.bin", 100000, "") + , Traversal_Usecase("dimacs10/road_usa_T.mtx.bin", 100) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 0) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 100) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 10000) + , Traversal_Usecase("graphs/Twitter/twitter.bin", 3000000) + //, Traversal_Usecase("dimacs10/hugebubbles-00020_T.mtx.bin", 100000) + ///// instances using mask + , Traversal_Usecase("graphs/small/small.bin", 0, true) + , Traversal_Usecase("graphs/small/small.bin", 0, true) + , Traversal_Usecase("graphs/small/small.bin", 3, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 0, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 100, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 1000, true) + , Traversal_Usecase("graphs/dblp/dblp.bin", 100000, true) + , Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 0, true) + ) + + ); + +INSTANTIATE_TEST_CASE_P(StressTest, + NVGraphCAPITests_Traversal_Stress, + ::testing::Values( + Traversal_Usecase("graphs/Wikipedia/2003/wiki2003.bin", 0) + ) + ); + + +int main(int argc, char **argv) +{ + + for (int i = 0; i < argc; i++) + { + if (strcmp(argv[i], "--perf") == 0) + PERF = 1; + if (strcmp(argv[i], "--stress-iters") == 0) + STRESS_MULTIPLIER = atoi(argv[i+1]); + if (strcmp(argv[i], "--ref-data-dir") == 0) + ref_data_prefix = std::string(argv[i+1]); + if (strcmp(argv[i], "--graph-data-dir") == 0) + graph_data_prefix = std::string(argv[i+1]); + } + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_triangles.cpp b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_triangles.cpp new file mode 100644 index 00000000000..d68affc17c5 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_capi_tests_triangles.cpp @@ -0,0 +1,377 @@ +// This is gtest application that contains all of the C API tests. 
Parameters: +// nvgraph_capi_tests [--perf] [--stress-iters N] [--gtest_filter=NameFilterPatter] +// It also accepts any other gtest (1.7.0) default parameters. +// Right now this application contains: +// 1) Sanity Check tests - tests on simple examples with known answer (or known behaviour) +// 2) Correctness checks tests - tests on real graph data, uses reference algorithm +// (CPU code for SrSPMV and python scripts for other algorithms, see +// python scripts here: //sw/gpgpu/nvgraph/test/ref/) with reference results, compares those two. +// It also measures performance of single algorithm C API call, enf enabled (see below) +// 3) Corner cases tests - tests with some bad inputs, bad parameters, expects library to handle +// it gracefully +// 4) Stress tests - makes sure that library result is persistent throughout the library usage +// (a lot of C API calls). Also makes some assumptions and checks on memory usage during +// this test. +// +// We can control what tests to launch by using gtest filters. For example: +// Only sanity tests: +// ./nvgraph_capi_tests --gtest_filter=*Sanity* +// And, correspondingly: +// ./nvgraph_capi_tests --gtest_filter=*Correctness* +// ./nvgraph_capi_tests --gtest_filter=*Corner* +// ./nvgraph_capi_tests --gtest_filter=*Stress* +// Or, combination: +// ./nvgraph_capi_tests --gtest_filter=*Sanity*:*Correctness* +// +// Performance reports are provided in the ERIS format and disabled by default. +// Could be enabled by adding '--perf' to the command line. I added this parameter to vlct +// +// Parameter '--stress-iters N', which gives multiplier (not an absolute value) for the number of launches for stress tests +// + +#include + +#include "gtest/gtest.h" + +#include "nvgraph_test_common.h" + +#include "valued_csr_graph.hxx" +#include "readMatrix.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include // experimental header, contains hidden API entries, can be shared only under special circumstances without reveling internal things + +#include "stdlib.h" +#include "stdint.h" +#include + +// do the perf measurements, enabled by command line parameter '--perf' +static int PERF = 0; + +// minimum vertices in the graph to perform perf measurements +#define PERF_ROWS_LIMIT 10000 +static int complex_repeats = 20; +static std::string ref_data_prefix = ""; +static std::string graph_data_prefix = ""; + +template +struct comparison +{ + bool operator() (T* lhs, T* rhs) {return (*lhs) < (*rhs);} +}; + + +template +bool enough_device_memory(int n, int nnz, size_t add) +{ + size_t mtotal, mfree; + cudaMemGetInfo(&mfree, &mtotal); + if (mfree > add + sizeof(T)*3*(n + nnz)) + return true; + return false; +} + +std::string convert_to_local_path(const std::string& in_file) +{ + std::string wstr = in_file; + if ((wstr != "dummy") & (wstr != "")) + { + std::string prefix; + if (graph_data_prefix.length() > 0) + { + prefix = graph_data_prefix; + } + else + { +#ifdef _WIN32 + //prefix = "C:\\mnt\\eris\\test\\matrices_collection\\"; + prefix = "\\\\cuda-vnetapp\\eris_matrices\\"; + std::replace(wstr.begin(), wstr.end(), '/', '\\'); +#else + prefix = "/mnt/nvgraph_test_data/"; +#endif + } + wstr = prefix + wstr; + } + return wstr; +} + +class NVGraphCAPITests_Triangles_Sanity : public ::testing::Test { + public: + nvgraphStatus_t status; + nvgraphHandle_t handle; + nvgraphTopologyType_t topo; + nvgraphGraphDescr_t g1; + + NVGraphCAPITests_Triangles_Sanity() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + 
nvgraphStatus_t status; + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + + + void prepare_and_run(const void* topo_st, bool lower_triangular, uint64_t expected ) + { + g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + status = nvgraphSetGraphStructure(handle, g1, (void*)topo_st, topo); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + uint64_t res = 0; + + status = nvgraphTriangleCount(handle, g1, &res); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + //printf("Expected triangles: %" PRIu64 ", got triangles: %" PRIu64 "\n", expected, res); + + // get result + ASSERT_EQ(expected, res); + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + + void run_star_test_csr() + { + int N = 1024; // min is 5 + int n = N - 1; + int nnz = 2 * (N - 1) ; + std::vector offsets(N+1), neighborhood(nnz); + offsets[0] = 0; offsets[1] = 0; + int cur_nnz = 0; + for (int i = 1; i < N; i++) + { + for (int j = 0; j < i; j++) + { + if (j == 0 || j == i - 1 || (j == 1 && i == (N-1))) + { + neighborhood[cur_nnz] = j; + cur_nnz++; + } + } + offsets[i+1] = cur_nnz; + } + //offsets[n] = cur_nnz; + /*printf("N = %d, n = %d, nnz = %d\n", N, n, nnz); + for (int i = 0; i < N+1; i++) + printf("RO [%d] == %d\n", i, offsets[i]); + + for (int i = 0; i < nnz; i++) + printf("CI [%d] == %d\n", i, neighborhood[i]);*/ + + topo = NVGRAPH_CSR_32; + + nvgraphCSRTopology32I_st topology = {N, nnz, &offsets[0], &neighborhood[0]}; + + prepare_and_run((void*)&topology, true, n); + } + + void run_seq_test_csr() + { + int N = 1024; // min is 3 + int n = N - 2; // actual number of triangles + int nnz = 2 * (N - 3) + 3; + std::vector offsets(N+1), neighborhood(nnz); + offsets[0] = 0; + int cur_nnz = 0; + for (int i = 0; i < N; i++) + { + if (i > 1) + { + neighborhood[cur_nnz] = i - 2; + cur_nnz++; + } + if (i > 0) + { + neighborhood[cur_nnz] = i - 1; + cur_nnz++; + } + offsets[i+1] = cur_nnz; + } + //offsets[n] = cur_nnz; + /*printf("N = %d, n = %d, nnz = %d\n", N, n, nnz); + for (int i = 0; i < N+1; i++) + printf("RO [%d] == %d\n", i, offsets[i]); + + for (int i = 0; i < nnz; i++) + printf("CI [%d] == %d\n", i, neighborhood[i]);*/ + + topo = NVGRAPH_CSR_32; + + nvgraphCSRTopology32I_st topology = {N, nnz, &offsets[0], &neighborhood[0]}; + + prepare_and_run((void*)&topology, true, n); + } +}; + +typedef struct TriCount_Usecase_t +{ + std::string graph_file; + uint64_t ref_tricount; + TriCount_Usecase_t(const std::string& a, uint64_t b) : ref_tricount(b) { graph_file = convert_to_local_path(a); }; + TriCount_Usecase_t& operator=(const TriCount_Usecase_t& rhs) + { + graph_file = rhs.graph_file; + ref_tricount = rhs.ref_tricount; + return *this; + } +} TriCount_Usecase_t; + +class TriCountRefGraphCheck : public ::testing::TestWithParam { + public: + TriCountRefGraphCheck() : handle(NULL) {} + + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + + void 
run_current_test(const TriCount_Usecase_t& param) + { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.graph_file); + + nvgraphTopologyType_t topo = NVGRAPH_CSR_32; + + nvgraphStatus_t status; + + FILE* fpin = fopen(param.graph_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << "Cannot read input graph file: " << param.graph_file << std::endl; + int n, nnz; + //Read a transposed network in amgx binary format and the bookmark of dangling nodes + std::vector read_row_ptr, read_col_ind; + ASSERT_EQ(read_csr_bin (fpin, n, nnz, read_row_ptr, read_col_ind), 0); + fclose(fpin); + + if (!enough_device_memory(n, nnz, sizeof(int)*(read_row_ptr.size() + read_col_ind.size()))) + { + std::cout << "[ WAIVED ] " << test_info->test_case_name() << "." << test_info->name() << std::endl; + return; + } + + nvgraphGraphDescr_t g1 = NULL; + status = nvgraphCreateGraphDescr(handle, &g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // set up graph + nvgraphCSRTopology32I_st topology = {n, nnz, &read_row_ptr[0], &read_col_ind[0]}; + status = nvgraphSetGraphStructure(handle, g1, (void*)&topology, topo); + + uint64_t res = 0; + + status = nvgraphTriangleCount(handle, g1, &res); + cudaDeviceSynchronize(); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // run + if (PERF && n > PERF_ROWS_LIMIT) + { + double start, stop; + start = second(); + start = second(); + int repeat = complex_repeats; + for (int i = 0; i < repeat; i++) + { + status = nvgraphTriangleCount(handle, g1, &res); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + cudaDeviceSynchronize(); + stop = second(); + printf("&&&& PERF Time_%s %10.8f -ms\n", test_id.c_str(), 1000.0*(stop-start)/repeat); + } + + //printf("Expected triangles: %" PRIu64 ", got triangles: %" PRIu64 "\n", expected, res); + ASSERT_EQ(param.ref_tricount, res); + + status = nvgraphDestroyGraphDescr(handle, g1); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } +}; + +TEST_P(TriCountRefGraphCheck, CorrectnessCheck) +{ + run_current_test(GetParam()); +} + + +INSTANTIATE_TEST_CASE_P(NVGraphCAPITests_TriCount, + TriCountRefGraphCheck, + // graph FILE reference number of triangles +// // we read matrix stored in CSR and pass it as CSC - so matrix is in fact transposed, that's why we compare it to the results calculated on a transposed matrix + ::testing::Values( + TriCount_Usecase_t("graphs/triangles_counting/as-skitter_internet_topo.csr.bin" , 28769868) + , TriCount_Usecase_t("graphs/triangles_counting/cage15_N_5154859.csr.bin" , 36106416 ) + , TriCount_Usecase_t("graphs/triangles_counting/cit-Patents_N_3774768.csr.bin" , 7515023) + , TriCount_Usecase_t("graphs/triangles_counting/coAuthorsCiteseer_N_227320.csr.bin" , 2713298) + , TriCount_Usecase_t("graphs/triangles_counting/com-orkut_N_3072441.csr.bin" , 627584181) + , TriCount_Usecase_t("graphs/triangles_counting/coPapersCiteseer.csr.bin" , 872040567) + , TriCount_Usecase_t("graphs/triangles_counting/coPapersDBLP_N_540486.csr.bin" , 444095058) + , TriCount_Usecase_t("graphs/triangles_counting/europe_osm_N_50912018.csr.bin" , 61710) + , TriCount_Usecase_t("graphs/triangles_counting/hollywood-2009_N_1139905.csr.bin" , 4916374555) + , TriCount_Usecase_t("graphs/triangles_counting/kron_g500-simple-logn16.csr.bin" , 118811321) + , TriCount_Usecase_t("graphs/triangles_counting/kron_g500-simple-logn18.csr.bin" , 687677667) + , 
TriCount_Usecase_t("graphs/triangles_counting/kron_g500-simple-logn21.csr.bin" , 8815649682) + , TriCount_Usecase_t("graphs/triangles_counting/mouse_gene_N_45101.csr.bin" , 3619097862) + , TriCount_Usecase_t("graphs/triangles_counting/road_central_N_14081816.csr.bin" , 228918) + , TriCount_Usecase_t("graphs/triangles_counting/soc-LiveJournal1_N_4847571.csr.bin" , 285730264) + , TriCount_Usecase_t("graphs/triangles_counting/wb-edu_N_9845725.csr.bin" , 254718147) + ///// more instances + ) + ); + +TEST_F(NVGraphCAPITests_Triangles_Sanity, SanityStarCSR) +{ + run_star_test_csr(); +} + +TEST_F(NVGraphCAPITests_Triangles_Sanity, SanitySeqCSR) +{ + run_seq_test_csr(); +} + +int main(int argc, char **argv) +{ + + for (int i = 0; i < argc; i++) + { + if (strcmp(argv[i], "--perf") == 0) + PERF = 1; + if (strcmp(argv[i], "--ref-data-dir") == 0) + ref_data_prefix = std::string(argv[i+1]); + if (strcmp(argv[i], "--graph-data-dir") == 0) + graph_data_prefix = std::string(argv[i+1]); + } + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/cpp/nvgraph/cpp/tests/nvgraph_test.cpp b/cpp/nvgraph/cpp/tests/nvgraph_test.cpp new file mode 100644 index 00000000000..d10b9f8e7ab --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_test.cpp @@ -0,0 +1,976 @@ +#include "gtest/gtest.h" +#include "valued_csr_graph.hxx" +#include "nvgraphP.h" +#include "nvgraph.h" +#include +class NvgraphAPITest : public ::testing::Test { + public: + NvgraphAPITest() : handle(NULL) {} + + protected: + static void SetupTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() { + if (handle == NULL) { + status = nvgraphCreate(&handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + } + } + virtual void TearDown() { + if (handle != NULL) { + status = nvgraphDestroy(handle); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + handle = NULL; + } + } + nvgraphStatus_t status; + nvgraphHandle_t handle; + cudaStream_t *stream; +}; + + +nvgraphCSRTopology32I_st topoData; +void createTopo() +{ +// nvgraphStatus_t mystatus; + topoData.nvertices = 4; + topoData.nedges = 5; + int offsets[6]; //{0,1,3,4,5,5}; + offsets[0] = 0; + offsets[1] = 1; + offsets[2] = 3; + offsets[3] = 4; + offsets[4] = 5; + offsets[5] = 5; + topoData.source_offsets= offsets; + + int neighborhood[5]; + neighborhood[0]=0; + neighborhood[1]=2; + neighborhood[2]=3; + neighborhood[3]=4; + neighborhood[4]=4; + + topoData.destination_indices = neighborhood; + +}; + +TEST_F(NvgraphAPITest,NvgraphCreateDestroy) +{ +} + + +TEST_F(NvgraphAPITest,NvgraphStatusGetString ) +{ + + const char *ret_status_str; + nvgraphStatus_t status = NVGRAPH_STATUS_SUCCESS; + ret_status_str = nvgraphStatusGetString( status); + const std::string success_str = "Success"; + + ASSERT_EQ( ret_status_str, success_str); + +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailNotInit) +{ +// nvgraphStatus_t status; + //status = nvgraphDestroy( handle); + const std::string not_init_str = "nvGRAPH not initialized"; + const char *ret_status_str; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_NOT_INITIALIZED); + ASSERT_EQ( ret_status_str, not_init_str); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailAllocFailed) +{ + const char *ret_status_str; + const std::string alloc_failed = "nvGRAPH alloc failed"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_ALLOC_FAILED); + ASSERT_EQ( ret_status_str, alloc_failed); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailInvalidValue) +{ + const char *ret_status_str; + const std::string 
invalid_value = "nvGRAPH invalid value"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_INVALID_VALUE); + ASSERT_EQ( ret_status_str, invalid_value); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailArchMismatch) +{ + const char *ret_status_str; + const std::string arch_mismatch = "nvGRAPH arch mismatch"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_ARCH_MISMATCH); + ASSERT_EQ( ret_status_str, arch_mismatch); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailMappingError) +{ + const char *ret_status_str; + const std::string mapping_error = "nvGRAPH mapping error"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_MAPPING_ERROR); + ASSERT_EQ( ret_status_str, mapping_error); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailExecFailed) +{ + const char *ret_status_str; + const std::string exec_failed = "nvGRAPH execution failed"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_EXECUTION_FAILED); + ASSERT_EQ( ret_status_str, exec_failed); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailInternalError) +{ + const char *ret_status_str; + const std::string internal_error = "nvGRAPH internal error"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_INTERNAL_ERROR); + ASSERT_EQ( ret_status_str, internal_error); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailTypeNotSupported) +{ + const char *ret_status_str; + const std::string type_not_supported = "nvGRAPH type not supported"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_TYPE_NOT_SUPPORTED); + ASSERT_EQ( ret_status_str, type_not_supported); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailGraphTypeNotSupported) +{ + const char *ret_status_str; + const std::string type_not_supported = "nvGRAPH graph type not supported"; + ret_status_str = nvgraphStatusGetString(NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED); + ASSERT_EQ( ret_status_str, type_not_supported); +} + +TEST_F(NvgraphAPITest,NvgraphStatusGetStringFailUnknownNvgraphStatus) +{ + const char *ret_status_str; + const std::string unknown_nvgraph_status = "Unknown nvGRAPH Status"; + ret_status_str = nvgraphStatusGetString((nvgraphStatus_t)11); + ASSERT_EQ( ret_status_str, unknown_nvgraph_status); +} + +TEST_F(NvgraphAPITest,NvgraphCreateGraphDescr) +{ + nvgraphGraphDescr_t G=NULL; + status = nvgraphCreateGraphDescr(handle, &G); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +TEST_F(NvgraphAPITest,NvgraphCreateDestroyGraphDescr) +{ + nvgraphGraphDescr_t G=NULL; + status = nvgraphCreateGraphDescr(handle, &G); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphDestroyGraphDescr(handle, G); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +TEST_F(NvgraphAPITest,NvgraphCreateDestroyGraphDescr_CornerCases) +{ + nvgraphGraphDescr_t G = NULL; + status = nvgraphDestroyGraphDescr(handle, G); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); +} + +TEST_F(NvgraphAPITest,NvgraphGraphDescrSetCSRTopology) +{ + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + nvgraphCSRTopology32I_st topoData; + topoData.nvertices = 0; + topoData.nedges = 0; + topoData.source_offsets = NULL; + topoData.destination_indices = NULL; + + // Bad topology, missing all entries, should fail + status=nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + topoData.nvertices = 4; + topoData.nedges = 4; + + // Bad topology, missing all offsets and indices, should fail + 
status=nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + int offsets[6]; //{0,1,3,4,5,5}; + offsets[0] = 0; + offsets[1] = 1; + offsets[2] = 3; + offsets[3] = 4; + offsets[4] = 5; + offsets[5] = 5; + topoData.source_offsets= offsets; + + // Bad topology, missing destination_indices, should fail + status=nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + int indices[4]; + indices[0] = 1; + indices[0] = 2; + indices[0] = 3; + indices[0] = 4; + topoData.destination_indices = indices; + // Should be ok now + status=nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphDestroyGraphDescr(handle, descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + + +TEST_F(NvgraphAPITest,NvgraphGraphDescrSetGetTopologyCSR) +{ + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // 1, 0, 0, 0, 0, 0, 0 + // 0, 1, 0, 0, 0, 0, 0 + // 0, 0, 0, 0, 0, 0, 0 + // 1, 0, 0, 0, 0, 0, 1 + // 1, 1, 1, 0, 0, 0, 0 + // 0, 0, 0, 0, 0, 0, 0 + // 1, 1, 1, 0, 0, 0, 1 + // indptr=[0 1 2 2 4 7 7 11] // 8 + // indices=[0 1 0 6 0 1 2 0 1 2 6] // 11 + // n=7 + // nnz=11 + int rowPtr[] = {0, 1, 2, 2, 4, 7, 7, 11}; + int colInd[] = {0, 1, 0, 6, 0, 1, 2, 0, 1, 2, 6}; + + nvgraphCSRTopology32I_st topoData; + topoData.nedges = 11; // nnz + topoData.nvertices = 7; // n + topoData.source_offsets = rowPtr; + topoData.destination_indices = colInd; + + status=nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status=nvgraphGetGraphStructure(handle, descrG, NULL, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // Check TType return value + nvgraphTopologyType_t TType; + status=nvgraphGetGraphStructure(handle, descrG, NULL, &TType); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(NVGRAPH_CSR_32, TType); + + // Check topoGet nedges and nvertices + nvgraphCSRTopology32I_st topoDataGet; + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + topoDataGet.source_offsets=NULL; + topoDataGet.destination_indices=NULL; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + + // Check topoGet nedges, nvertices and offsets + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + int rowPtrGet[8]; + rowPtrGet[0]=0; + rowPtrGet[1]=0; + rowPtrGet[2]=0; + rowPtrGet[3]=0; + rowPtrGet[4]=0; + rowPtrGet[5]=0; + rowPtrGet[6]=0; + rowPtrGet[7]=0; + topoDataGet.source_offsets=rowPtrGet; + topoDataGet.destination_indices=NULL; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + ASSERT_EQ(rowPtr[0], rowPtrGet[0]); + ASSERT_EQ(rowPtr[1], rowPtrGet[1]); + ASSERT_EQ(rowPtr[2], rowPtrGet[2]); + ASSERT_EQ(rowPtr[3], rowPtrGet[3]); + ASSERT_EQ(rowPtr[4], rowPtrGet[4]); + ASSERT_EQ(rowPtr[5], rowPtrGet[5]); + ASSERT_EQ(rowPtr[6], rowPtrGet[6]); + ASSERT_EQ(rowPtr[7], rowPtrGet[7]); + + // Check topoGet + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + rowPtrGet[0]=0; + rowPtrGet[1]=0; + rowPtrGet[2]=0; + 
rowPtrGet[3]=0; + rowPtrGet[4]=0; + rowPtrGet[5]=0; + rowPtrGet[6]=0; + rowPtrGet[7]=0; + int colIndGet[11]; + colIndGet[0]=0; + colIndGet[1]=0; + colIndGet[2]=0; + colIndGet[3]=0; + colIndGet[4]=0; + colIndGet[5]=0; + colIndGet[6]=0; + colIndGet[7]=0; + colIndGet[8]=0; + colIndGet[9]=0; + colIndGet[10]=0; + topoDataGet.source_offsets=rowPtrGet; + topoDataGet.destination_indices=colIndGet; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + ASSERT_EQ(rowPtr[0], rowPtrGet[0]); + ASSERT_EQ(rowPtr[1], rowPtrGet[1]); + ASSERT_EQ(rowPtr[2], rowPtrGet[2]); + ASSERT_EQ(rowPtr[3], rowPtrGet[3]); + ASSERT_EQ(rowPtr[4], rowPtrGet[4]); + ASSERT_EQ(rowPtr[5], rowPtrGet[5]); + ASSERT_EQ(rowPtr[6], rowPtrGet[6]); + ASSERT_EQ(rowPtr[7], rowPtrGet[7]); + ASSERT_EQ(colInd[0], colIndGet[0]); + ASSERT_EQ(colInd[1], colIndGet[1]); + ASSERT_EQ(colInd[2], colIndGet[2]); + ASSERT_EQ(colInd[3], colIndGet[3]); + ASSERT_EQ(colInd[4], colIndGet[4]); + ASSERT_EQ(colInd[5], colIndGet[5]); + ASSERT_EQ(colInd[6], colIndGet[6]); + ASSERT_EQ(colInd[7], colIndGet[7]); + ASSERT_EQ(colInd[8], colIndGet[8]); + ASSERT_EQ(colInd[9], colIndGet[9]); + ASSERT_EQ(colInd[10], colIndGet[10]); + + // Check all + TType=NVGRAPH_CSC_32; + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + rowPtrGet[0]=0; + rowPtrGet[1]=0; + rowPtrGet[2]=0; + rowPtrGet[3]=0; + rowPtrGet[4]=0; + rowPtrGet[5]=0; + rowPtrGet[6]=0; + rowPtrGet[7]=0; + colIndGet[0]=0; + colIndGet[1]=0; + colIndGet[2]=0; + colIndGet[3]=0; + colIndGet[4]=0; + colIndGet[5]=0; + colIndGet[6]=0; + colIndGet[7]=0; + colIndGet[8]=0; + colIndGet[9]=0; + colIndGet[10]=0; + topoDataGet.source_offsets=rowPtrGet; + topoDataGet.destination_indices=colIndGet; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, &TType); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(NVGRAPH_CSR_32, TType); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + ASSERT_EQ(rowPtr[0], rowPtrGet[0]); + ASSERT_EQ(rowPtr[1], rowPtrGet[1]); + ASSERT_EQ(rowPtr[2], rowPtrGet[2]); + ASSERT_EQ(rowPtr[3], rowPtrGet[3]); + ASSERT_EQ(rowPtr[4], rowPtrGet[4]); + ASSERT_EQ(rowPtr[5], rowPtrGet[5]); + ASSERT_EQ(rowPtr[6], rowPtrGet[6]); + ASSERT_EQ(rowPtr[7], rowPtrGet[7]); + ASSERT_EQ(colInd[0], colIndGet[0]); + ASSERT_EQ(colInd[1], colIndGet[1]); + ASSERT_EQ(colInd[2], colIndGet[2]); + ASSERT_EQ(colInd[3], colIndGet[3]); + ASSERT_EQ(colInd[4], colIndGet[4]); + ASSERT_EQ(colInd[5], colIndGet[5]); + ASSERT_EQ(colInd[6], colIndGet[6]); + ASSERT_EQ(colInd[7], colIndGet[7]); + ASSERT_EQ(colInd[8], colIndGet[8]); + ASSERT_EQ(colInd[9], colIndGet[9]); + ASSERT_EQ(colInd[10], colIndGet[10]); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +TEST_F(NvgraphAPITest,NvgraphGraphDescrSetGetTopologyCSC) +{ + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // 1, 0, 0, 0, 0, 0, 0 + // 0, 1, 0, 0, 0, 0, 0 + // 0, 0, 0, 0, 0, 0, 0 + // 1, 0, 0, 0, 0, 0, 1 + // 1, 1, 1, 0, 0, 0, 0 + // 0, 0, 0, 0, 0, 0, 0 + // 1, 1, 1, 0, 0, 0, 1 + // offsets=[0 4 7 9 9 9 9 11] + // indices=[0 3 4 6 1 4 6 4 6 3 6] + // n=7 + // nnz=11 + int rowInd[] = {0, 3, 4, 6, 1, 4, 6, 4, 6, 3, 6}; + int colPtr[] = {0, 4, 7, 9, 9, 9, 9, 11}; + + nvgraphCSCTopology32I_st 
topoData; + topoData.nedges = 11; // nnz + topoData.nvertices = 7; // n + topoData.destination_offsets = colPtr; + topoData.source_indices = rowInd; + + status=nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status=nvgraphGetGraphStructure(handle, descrG, NULL, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + // Check TType return value + nvgraphTopologyType_t TType; + status=nvgraphGetGraphStructure(handle, descrG, NULL, &TType); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(NVGRAPH_CSR_32, TType); + + // Check topoGet nedges and nvertices + nvgraphCSCTopology32I_st topoDataGet; + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + topoDataGet.destination_offsets=NULL; + topoDataGet.source_indices=NULL; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + + // Check topoGet nedges, nvertices and offsets + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + int colPtrGet[8]; + colPtrGet[0]=0; + colPtrGet[1]=0; + colPtrGet[2]=0; + colPtrGet[3]=0; + colPtrGet[4]=0; + colPtrGet[5]=0; + colPtrGet[6]=0; + colPtrGet[7]=0; + topoDataGet.destination_offsets=colPtrGet; + topoDataGet.source_indices=NULL; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + ASSERT_EQ(colPtr[0], colPtrGet[0]); + ASSERT_EQ(colPtr[1], colPtrGet[1]); + ASSERT_EQ(colPtr[2], colPtrGet[2]); + ASSERT_EQ(colPtr[3], colPtrGet[3]); + ASSERT_EQ(colPtr[4], colPtrGet[4]); + ASSERT_EQ(colPtr[5], colPtrGet[5]); + ASSERT_EQ(colPtr[6], colPtrGet[6]); + ASSERT_EQ(colPtr[7], colPtrGet[7]); + + // Check topoGet + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + colPtrGet[0]=0; + colPtrGet[1]=0; + colPtrGet[2]=0; + colPtrGet[3]=0; + colPtrGet[4]=0; + colPtrGet[5]=0; + colPtrGet[6]=0; + colPtrGet[7]=0; + int rowIndGet[11]; + rowIndGet[0]=0; + rowIndGet[1]=0; + rowIndGet[2]=0; + rowIndGet[3]=0; + rowIndGet[4]=0; + rowIndGet[5]=0; + rowIndGet[6]=0; + rowIndGet[7]=0; + rowIndGet[8]=0; + rowIndGet[9]=0; + rowIndGet[10]=0; + topoDataGet.destination_offsets=colPtrGet; + topoDataGet.source_indices=rowIndGet; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, NULL); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + ASSERT_EQ(colPtr[0], colPtrGet[0]); + ASSERT_EQ(colPtr[1], colPtrGet[1]); + ASSERT_EQ(colPtr[2], colPtrGet[2]); + ASSERT_EQ(colPtr[3], colPtrGet[3]); + ASSERT_EQ(colPtr[4], colPtrGet[4]); + ASSERT_EQ(colPtr[5], colPtrGet[5]); + ASSERT_EQ(colPtr[6], colPtrGet[6]); + ASSERT_EQ(colPtr[7], colPtrGet[7]); + ASSERT_EQ(rowInd[0], rowIndGet[0]); + ASSERT_EQ(rowInd[1], rowIndGet[1]); + ASSERT_EQ(rowInd[2], rowIndGet[2]); + ASSERT_EQ(rowInd[3], rowIndGet[3]); + ASSERT_EQ(rowInd[4], rowIndGet[4]); + ASSERT_EQ(rowInd[5], rowIndGet[5]); + ASSERT_EQ(rowInd[6], rowIndGet[6]); + ASSERT_EQ(rowInd[7], rowIndGet[7]); + ASSERT_EQ(rowInd[8], rowIndGet[8]); + ASSERT_EQ(rowInd[9], rowIndGet[9]); + ASSERT_EQ(rowInd[10], rowIndGet[10]); + + // Check all + TType=NVGRAPH_CSC_32; + topoDataGet.nvertices=0; + topoDataGet.nedges=0; + colPtrGet[0]=0; + colPtrGet[1]=0; + colPtrGet[2]=0; + 
colPtrGet[3]=0; + colPtrGet[4]=0; + colPtrGet[5]=0; + colPtrGet[6]=0; + colPtrGet[7]=0; + rowIndGet[0]=0; + rowIndGet[1]=0; + rowIndGet[2]=0; + rowIndGet[3]=0; + rowIndGet[4]=0; + rowIndGet[5]=0; + rowIndGet[6]=0; + rowIndGet[7]=0; + rowIndGet[8]=0; + rowIndGet[9]=0; + rowIndGet[10]=0; + topoDataGet.destination_offsets=colPtrGet; + topoDataGet.source_indices=rowIndGet; + status=nvgraphGetGraphStructure(handle, descrG, (void *)&topoDataGet, &TType); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + ASSERT_EQ(NVGRAPH_CSR_32, TType); + ASSERT_EQ(topoData.nvertices, topoDataGet.nvertices); + ASSERT_EQ(topoData.nedges, topoDataGet.nedges); + ASSERT_EQ(colPtr[0], colPtrGet[0]); + ASSERT_EQ(colPtr[1], colPtrGet[1]); + ASSERT_EQ(colPtr[2], colPtrGet[2]); + ASSERT_EQ(colPtr[3], colPtrGet[3]); + ASSERT_EQ(colPtr[4], colPtrGet[4]); + ASSERT_EQ(colPtr[5], colPtrGet[5]); + ASSERT_EQ(colPtr[6], colPtrGet[6]); + ASSERT_EQ(colPtr[7], colPtrGet[7]); + ASSERT_EQ(rowInd[0], rowIndGet[0]); + ASSERT_EQ(rowInd[1], rowIndGet[1]); + ASSERT_EQ(rowInd[2], rowIndGet[2]); + ASSERT_EQ(rowInd[3], rowIndGet[3]); + ASSERT_EQ(rowInd[4], rowIndGet[4]); + ASSERT_EQ(rowInd[5], rowIndGet[5]); + ASSERT_EQ(rowInd[6], rowIndGet[6]); + ASSERT_EQ(rowInd[7], rowIndGet[7]); + ASSERT_EQ(rowInd[8], rowIndGet[8]); + ASSERT_EQ(rowInd[9], rowIndGet[9]); + ASSERT_EQ(rowInd[10], rowIndGet[10]); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +TEST_F(NvgraphAPITest,NvgraphGraphDescrSetGetVertexDataSingleFloat) +{ + typedef float T; + + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + /* Create topology before we load data */ + createTopo(); + + status = nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + T *vertexvals; + vertexvals = (T *) malloc(4*sizeof(T)); + vertexvals[0]=0.1; + vertexvals[1]=2.0; + vertexvals[2]=3.14; + vertexvals[3]=0; + +// size_t numsets=1; + + cudaDataType_t type_v[1] = {sizeof(T) > 4 ? CUDA_R_64F : CUDA_R_32F}; + + status = nvgraphAllocateVertexData(handle, descrG, 1, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphSetVertexData(handle, descrG, (void *)vertexvals, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + T *getvals; + getvals = (T *)malloc(4*sizeof(T)); + + status = nvgraphGetVertexData(handle, descrG, (void *)getvals, 0); + ASSERT_EQ( getvals[0], vertexvals[0]); + ASSERT_EQ( getvals[1], vertexvals[1]); + ASSERT_EQ( getvals[2], vertexvals[2]); + ASSERT_EQ( getvals[3], vertexvals[3]); + + free(vertexvals); + free(getvals); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + + +TEST_F(NvgraphAPITest,NvgraphSetGetVertexDataSingleDouble) +{ + typedef double T; + + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + /* Create topology before we load data */ + createTopo(); + status = nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + T *vertexvals; + vertexvals = (T *) malloc(4*sizeof(T)); + vertexvals[0]=0.1; + vertexvals[1]=2.0; + vertexvals[2]=3.14; + vertexvals[3]=0; + +// size_t numsets=1; + + cudaDataType_t type_v[1] = {sizeof(T) > 4 ? 
CUDA_R_64F : CUDA_R_32F}; + + status = nvgraphAllocateVertexData(handle, descrG, 1, type_v); +// nvgraph::Graph *G = static_cast*> (descrG->graph_handle); + + //status = nvgraphSetVertexData(handle, descrG, (void **)&vertexvals, numsets, type_v ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetVertexData(handle, descrG, (void *)vertexvals, 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + T *getvals; + getvals = (T *)malloc(4*sizeof(T)); + + status = nvgraphGetVertexData(handle, descrG, (void *)getvals, 0); + ASSERT_EQ( getvals[0], vertexvals[0]); + ASSERT_EQ( getvals[1], vertexvals[1]); + ASSERT_EQ( getvals[2], vertexvals[2]); + ASSERT_EQ( getvals[3], vertexvals[3]); + + free(vertexvals); + free(getvals); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +TEST_F(NvgraphAPITest,NvgraphSetGetVertexData_CornerCases) +{ + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + /* Create topology before we load data */ + createTopo(); + status = nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + double vertexvals0[2] = {0.1, 1e21}; + float vertexvals1[2] = {0.1f, 1e21f}; + void* vertexptr[2] = {(void*) vertexvals0, (void*)vertexvals1}; + + size_t numsets=2; + + cudaDataType_t type_v[2] = {CUDA_R_64F, CUDA_R_32F}; + status = nvgraphAllocateVertexData(handle, descrG, 1, type_v); + + status = nvgraphSetVertexData(NULL, descrG, (void *)vertexptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSetVertexData(handle, NULL, (void *)vertexptr[0], 0 ); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSetVertexData(handle, descrG, NULL, numsets ); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + // probably should be a success + status = nvgraphSetVertexData(handle, descrG, (void **)&vertexptr, 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphSetVertexData(handle, descrG, (void **)&vertexptr, numsets ); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + { + // type mismatch +// double edge_data0 = 0.; +// float edge_data1 =1.; +// void* edge_ptr_bad[] = {(void*)&edge_data0, (void*)&edge_data1}; +// cudaDataType_t type_bad[2] = {CUDA_R_32F, CUDA_R_32F}; + //status = nvgraphSetEdgeData(handle, descrG, (void **)edge_ptr_bad, numsets ); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + } + + float getdoublevals0[2]; +// double getdoublevals1[2]; + status = nvgraphGetVertexData(NULL, descrG, (void *)getdoublevals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphGetVertexData(handle, NULL, (void *)getdoublevals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphGetVertexData(handle, descrG, (void *)NULL, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphGetVertexData(handle, descrG, (void *)getdoublevals0, 10); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + + +TEST_F(NvgraphAPITest,NvgraphSetGetVertexDataMulti) +{ + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + /* Create topology data */ + createTopo(); + status = nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + +// size_t 
numsets=3; + cudaDataType_t type_v[3] = {CUDA_R_32F, CUDA_R_64F, CUDA_R_32F}; + + void **vertexvals; + vertexvals = (void **)malloc(3*sizeof( void * )); + vertexvals[0] = (float *) malloc(4*sizeof(float)); + ((float *)vertexvals[0])[0]=0.1; + ((float *)vertexvals[0])[1]=2.0; + ((float *)vertexvals[0])[2]=3.14; + ((float *)vertexvals[0])[3]=0; + + vertexvals[1] = (double *)malloc(4*sizeof(double)); + ((double *)vertexvals[1])[0]=1.1e-10; + ((double *)vertexvals[1])[1]=2.0e20; + ((double *)vertexvals[1])[2]=3.14e-26; + ((double *)vertexvals[1])[3]=0.34e3; + + vertexvals[2] = (float *)malloc(4*sizeof(float)); + ((float *)vertexvals[2])[0]=1.1e-1; + ((float *)vertexvals[2])[1]=2.0e2; + ((float *)vertexvals[2])[2]=3.14e-2; + ((float *)vertexvals[2])[3]=0.34e6; + + status = nvgraphAllocateVertexData(handle, descrG, 1, type_v); + + float *getfloatvals; + getfloatvals = (float *)malloc(4*sizeof(float)); + + status = nvgraphSetVertexData(handle, descrG, (void *)vertexvals[0], 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + status = nvgraphGetVertexData(handle, descrG, (void *)getfloatvals, 0); + float *float_data=((float *)vertexvals[0]); + ASSERT_EQ( (float)getfloatvals[0], float_data[0]); + ASSERT_EQ( (float)getfloatvals[1], float_data[1]); + ASSERT_EQ( (float)getfloatvals[2], float_data[2]); + ASSERT_EQ( (float)getfloatvals[3], float_data[3]); + + double *getdoublevals; + getdoublevals = (double *)malloc(4*sizeof(double)); + + status = nvgraphSetVertexData(handle, descrG, (void *)vertexvals[1], 1); + + status = nvgraphGetVertexData(handle, descrG, (void *)getdoublevals, 1); +// double *double_data=((double *)vertexvals[1]); + //ASSERT_EQ( (double)getdoublevals[0], double_data[0]); + //ASSERT_EQ( (double)getdoublevals[1], double_data[1]); + //ASSERT_EQ( (double)getdoublevals[2], double_data[2]); + //ASSERT_EQ( (double)getdoublevals[3], double_data[3]); + + free(vertexvals[0]); + free(vertexvals[1]); + free(vertexvals[2]); + free(vertexvals); + free(getfloatvals); + free(getdoublevals); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + + +TEST_F(NvgraphAPITest,NvgraphSetGetEdgeDataSingleFloat) +{ + typedef float T; + + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + /* Create topology */ + createTopo(); + status = nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + T *edgevals; + edgevals = (T *) malloc(5*sizeof(T)); + edgevals[0]=0.1; + edgevals[1]=2.0; + edgevals[2]=3.14; + edgevals[3]=0; + edgevals[4]=10101.10101; + +// size_t numsets=1; + + cudaDataType_t type_v[1] = {sizeof(T) > 4 ? 
CUDA_R_64F : CUDA_R_32F}; + + status = nvgraphAllocateEdgeData(handle, descrG, 1, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, descrG, (void *)edgevals, 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + T *getvals; + getvals = (T *)malloc(5*sizeof(T)); + + status = nvgraphGetEdgeData(handle, descrG, (void *)getvals, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + ASSERT_EQ( getvals[0], edgevals[0]); + ASSERT_EQ( getvals[1], edgevals[1]); + ASSERT_EQ( getvals[2], edgevals[2]); + ASSERT_EQ( getvals[3], edgevals[3]); + ASSERT_EQ( getvals[4], edgevals[4]); + + free(edgevals); + free(getvals); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + + +TEST_F(NvgraphAPITest,NvgraphSetGetEdgeDataSingleDouble) +{ + typedef double T; + + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + /* Create topology */ + createTopo(); + status = nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + + T *edgevals; + edgevals = (T *) malloc(5*sizeof(T)); + edgevals[0]=0.1; + edgevals[1]=2.0; + edgevals[2]=3.14; + edgevals[3]=0; + edgevals[4]=10101.10101; + +// size_t numsets=1; + + cudaDataType_t type_v[1] = {sizeof(T) > 4 ? CUDA_R_64F : CUDA_R_32F}; + + status = nvgraphAllocateEdgeData(handle, descrG, 1, type_v); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(handle, descrG, (void *)edgevals, 0 ); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + T *getvals; + getvals = (T *)malloc(5*sizeof(T)); + status = nvgraphGetEdgeData(handle, descrG, (void *)getvals, 0); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + ASSERT_EQ( getvals[0], edgevals[0]); + ASSERT_EQ( getvals[1], edgevals[1]); + ASSERT_EQ( getvals[2], edgevals[2]); + ASSERT_EQ( getvals[3], edgevals[3]); + ASSERT_EQ( getvals[4], edgevals[4]); + + free(edgevals); + free(getvals); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + + +TEST_F(NvgraphAPITest,NvgraphSetGetEdgeData_CornerCases) +{ + nvgraphGraphDescr_t descrG=NULL; + status = nvgraphCreateGraphDescr(handle, &descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + /* Create topology */ + createTopo(); + status = nvgraphSetGraphStructure(handle, descrG, (void *)&topoData, NVGRAPH_CSR_32); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + + double edgevals0[1] = {0.1}; + float edgevals1[1] = {0.1f}; + void* edgeptr[2] = {(void*) edgevals0, (void*)edgevals1}; + +// size_t numsets=2; + + cudaDataType_t type_e[2] = {CUDA_R_64F, CUDA_R_32F}; + + status = nvgraphAllocateEdgeData(handle, descrG, 1, type_e); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); + status = nvgraphSetEdgeData(NULL, descrG, edgeptr, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSetEdgeData(handle, NULL, edgeptr, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + status = nvgraphSetEdgeData(handle, descrG, NULL, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + //status = nvgraphSetEdgeData(handle, descrG, edgeptr, 0); + //ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + { + // type mismatch +// double vertexvals0[2] = {0.1, 1e21}; +// float vertexvals1[2] = {0.1f, 1e21f}; +// void* vertexptr_bad[2] = {(void*) vertexvals0, (void*)vertexvals1}; + +// cudaDataType_t type_bad[2] = {CUDA_R_32F, CUDA_R_32F}; + //status = nvgraphSetVertexData(handle, descrG, 
(void **)vertexptr_bad, numsets, type_bad ); + ASSERT_NE(NVGRAPH_STATUS_SUCCESS, status); + } + +// float getdoublevals0[2]; +// double getdoublevals1[2]; + //status = nvgraphGetEdgeData(NULL, descrG, (void *)getdoublevals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + //status = nvgraphGetEdgeData(handle, NULL, (void *)getdoublevals0, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + //status = nvgraphGetEdgeData(handle, descrG, NULL, 0); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + //status = nvgraphGetEdgeData(handle, descrG, (void *)getdoublevals0, 10); + ASSERT_EQ(NVGRAPH_STATUS_INVALID_VALUE, status); + + status = nvgraphDestroyGraphDescr(handle,descrG); + ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, status); +} + +int main(int argc, char **argv) +{ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/cpp/nvgraph/cpp/tests/nvgraph_test_common.h b/cpp/nvgraph/cpp/tests/nvgraph_test_common.h new file mode 100644 index 00000000000..29a8808ab3d --- /dev/null +++ b/cpp/nvgraph/cpp/tests/nvgraph_test_common.h @@ -0,0 +1,121 @@ +#include /* import labs() */ +#include + +#include +#include + +#if defined(_WIN32) +#if !defined(WIN32_LEAN_AND_MEAN) +#define WIN32_LEAN_AND_MEAN +#endif +#define NOMINMAX +#include +static double second (void) +{ + LARGE_INTEGER t; + static double oofreq; + static int checkedForHighResTimer; + static BOOL hasHighResTimer; + + if (!checkedForHighResTimer) { + hasHighResTimer = QueryPerformanceFrequency (&t); + oofreq = 1.0 / (double)t.QuadPart; + checkedForHighResTimer = 1; + } + if (hasHighResTimer) { + QueryPerformanceCounter (&t); + return (double)t.QuadPart * oofreq; + } else { + return (double)GetTickCount() / 1000.0; + } +} + +static long long getSystemMemory() +{ + MEMORYSTATUSEX state; // Requires >= win2k + memset (&state, 0, sizeof(state)); + state.dwLength = sizeof(state); + if (0 == GlobalMemoryStatusEx(&state)) { + return 0; + } else { + return (long long)state.ullTotalPhys; + } +} +#elif defined(__linux) || defined(__powerpc64__) +#include +#include +#include +#include +static double second (void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +static long long getSystemMemory(void) +{ + struct sysinfo s_info; + sysinfo (&s_info); + return (long long)s_info.totalram * (long long)s_info.mem_unit; +} +#elif defined(__APPLE__) +#include +#include +#include +#include +#include +static double second (void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +static long long getSystemMemory(void) +{ + int memmib[2] = { CTL_HW, HW_MEMSIZE }; + long long mem = (size_t)0; + size_t memsz = sizeof(mem); + + /* NOTE: This may cap memory reported at 2GB */ + if (sysctl(memmib, 2, &mem, &memsz, NULL, 0) == -1) { + return 0; + } else { + return mem; + } +} +#elif defined(__QNX__) +#include +#include +#include +static double second (void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +static long long getSystemMemory(void) +{ + return 0; +} +#else +#error unsupported platform +#endif + +std::string getFileName(const std::string& s) { + + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = s.rfind(sep, s.length()); + if (i != std::string::npos) { + return(s.substr(i+1, s.length() - i)); + } + + return(""); +} diff --git a/cpp/nvgraph/cpp/tests/readMatrix.hxx b/cpp/nvgraph/cpp/tests/readMatrix.hxx new 
file mode 100644 index 00000000000..01fb9f64197 --- /dev/null +++ b/cpp/nvgraph/cpp/tests/readMatrix.hxx @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include //stringstream +#include +#include +#include +#include +#include +#include + +//Matrix Market COO reader-requires a call to sort in the test file +template +struct Mat +{ + + IndexType_ i; + IndexType_ j; + ValueType_ val; + bool transpose; + Mat() { + } //default cosntructor + Mat(bool transpose) : + transpose(transpose) { + } //pass in when comapring rows or columns + bool operator()(const Mat &x1, const Mat &x2) + { + if (!transpose) + { + if (x1.i == x2.i) + return x1.j < x2.j; //if rows equal sort by column index + return x1.i < x2.i; + } + else + { + if (x1.j == x2.j) + return x1.i < x2.i; //if rows equal sort by column index + return x1.j < x2.j; + } + } +}; +template +void dump_host_dense_mat(std::vector& v, int ld) + { + std::stringstream ss; + ss.str(std::string()); + ss << std::setw(10); + ss.precision(3); + for (int i = 0; i < ld; ++i) + { + for (int j = 0; j < ld; ++j) + { + ss << v[i * ld + j] << std::setw(10); + } + ss << std::endl; + } + std::cout << ss.str(); +} + +/** + * Reads in graphs given in the "network" format. This format consists a + * row for each edge in the graph, giving its source and destination. There + * is no header or comment lines. + * @param filename The name of the file to read in. + * @param nnz The number of edges given in the file. + * @param src Vector to write out the sources to. + * @param dest Vector to write out the destinations to. 
+ */ +template +void readNetworkFile(const char * filename, + size_t nnz, + std::vector& src, + std::vector& dest) { + std::ifstream infile; + infile.open(filename); + src.resize(nnz); + dest.resize(nnz); + for (size_t i = 0; i < nnz; i++) { + infile >> src[i]; + infile >> dest[i]; + } + infile.close(); + std::cout << "Read in " << nnz << " rows from: " << filename << "\n"; +} + +//reads the Matrix Market format from the florida collection of sparse matrices assuming +//the first lines are comments beginning with % +template +void readMatrixMarketFile(const char * filename, + IndexType_ &m, + IndexType_ &n, + IndexType_ &nnz, + std::vector > &matrix, + bool edges_only) { + std::ifstream infile; + infile.open(filename); + std::string line; + std::stringstream params; + while (1) + { + std::getline(infile, line); + //ignore initial comments that begin with % + if (line[0] != '%') + { + //first line without % for comments will have matrix size + params << line; + params >> n; + params >> m; + params >> nnz; + break; //break and then read in COO format + } + } + //COO format + matrix.resize(nnz); + //remaining file lines are tuples of row ind, col ind and possibly value + //sometimes value assumed to be one + for (int k = 0; k < nnz; ++k) + { + infile >> matrix[k].i; + infile >> matrix[k].j; + if (edges_only) + matrix[k].val = 1.0; + else + infile >> matrix[k].val; + } + + infile.close(); +} +//binary matrix reader functions +void printUsageAndExit() +{ + printf("%s", "Usage:./csrmv_pl matrix_csr.bin\n"); + printf("%s", "M is square, in Amgx binary format\n"); + + exit(0); +} + +int read_header_amgx_csr_bin(FILE* fpin, + int & n, + int & nz + ) + { + char text_header[255]; + unsigned int system_flags[9]; + size_t is_read1, is_read2; + + is_read1 = fread(text_header, sizeof(char), strlen("%%NVAMGBinary\n"), fpin); + is_read2 = fread(system_flags, sizeof(unsigned int), 9, fpin); + if (!is_read1 || !is_read2) + { + printf("%s", "I/O fail\n"); + return 1; + } + + // We assume that system_flags [] = { 1, 1, whatever, 0, 0, 1, 1, n, nz }; + /* + bool is_mtx = system_flags[0]; + bool is_rhs = system_flags[1]; + bool is_soln = system_flags[2]; + unsigned idx_t matrix_format = system_flags[3]; + bool diag = system_flags[4]; + unsigned idx_t block_dimx = system_flags[5]; + unsigned idx_t block_dimy = system_flags[6]; + */ + + if (system_flags[0] != 1 || system_flags[1] != 1 || + system_flags[3] != 0 || system_flags[4] != 0 || system_flags[5] != 1 || + system_flags[6] != 1 || system_flags[7] < 1 || system_flags[8] < 1) + + { + printf( "Wrong format : system_flags [] != { 1(%d), 1(%d), 0(%d), 0(%d), 0(%d), 1(%d), 1(%d), n(%d), nz(%d) }\n\n", + system_flags[0], + system_flags[1], + system_flags[2], + system_flags[3], + system_flags[4], + system_flags[5], + system_flags[6], + system_flags[7], + system_flags[8]); + return 1; + } + + n = system_flags[7]; + nz = system_flags[8]; + return 0; +} + +//reader is for ints and double +template +int read_csr_bin(FILE* fpin, + I &n, + I &nz, + std::vector &row_ptr, + std::vector &col_ind + ) + { + size_t is_read1, is_read2, is_read3, is_read4; + is_read1 = fread(&n, sizeof(I), 1, fpin); + is_read2 = fread(&nz, sizeof(I), 1, fpin); + if (!is_read1 || !is_read2) + { + printf("%s", "I/O fail\n"); + return 1; + } + row_ptr.resize(n + 1); + col_ind.resize(nz); + is_read3 = fread(&row_ptr[0], sizeof(I), n + 1, fpin); + is_read4 = fread(&col_ind[0], sizeof(I), nz, fpin); + + if (!is_read3 || !is_read4) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + 
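The `read_csr_bin` reader above expects a bare binary CSR layout: the vertex count `n` and the edge count `nz` as two integers of type `I`, followed by the `n + 1` row offsets and then the `nz` column indices. For reference, the sketch below (not part of the patch; the output file name and the toy graph are illustrative) writes a graph in exactly that layout with `I = int`, which is how the `.csr.bin` inputs of the triangle-counting reference tests are laid out.

```cpp
// Minimal writer for the plain binary CSR layout consumed by read_csr_bin when I is int:
//   int n, int nnz, int row_ptr[n+1], int col_ind[nnz]
#include <cstdio>
#include <vector>

static int write_csr_bin(const char* path, int n,
                         const std::vector<int>& row_ptr,
                         const std::vector<int>& col_ind)
{
    FILE* f = fopen(path, "wb");
    if (f == NULL) return 1;
    int nnz = static_cast<int>(col_ind.size());
    fwrite(&n,   sizeof(int), 1, f);                 // number of vertices
    fwrite(&nnz, sizeof(int), 1, f);                 // number of edges
    fwrite(row_ptr.data(), sizeof(int), n + 1, f);   // CSR row offsets
    fwrite(col_ind.data(), sizeof(int), nnz, f);     // CSR column indices
    fclose(f);
    return 0;
}

int main()
{
    // One triangle on vertices {0, 1, 2}, stored as its lower-triangular part,
    // the same convention the triangle-count sanity tests use for their inputs.
    std::vector<int> row_ptr = {0, 0, 1, 3};
    std::vector<int> col_ind = {0, 0, 1};
    return write_csr_bin("triangle.csr.bin", 3, row_ptr, col_ind);
}
```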
+//reader is for ints and double +int read_data_amgx_csr_bin(FILE* fpin, + int n, + int nz, + std::vector & row_ptr, + std::vector & col_ind, + std::vector& val + ) + { + size_t is_read1, is_read2, is_read3; + is_read1 = fread(&row_ptr[0], sizeof(std::vector::value_type), n + 1, fpin); + is_read2 = fread(&col_ind[0], sizeof(std::vector::value_type), nz, fpin); + is_read3 = fread(&val[0], sizeof(std::vector::value_type), nz, fpin); + + if (!is_read1 || !is_read2 || !is_read3) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + +int read_data_amgx_csr_bin_rhs(FILE* fpin, + int n, + int nz, + std::vector & row_ptr, + std::vector & col_ind, + std::vector& val, + std::vector& rhs + ) + { + size_t is_read1, is_read2, is_read3, is_read4; + is_read1 = fread(&row_ptr[0], sizeof(std::vector::value_type), n + 1, fpin); + is_read2 = fread(&col_ind[0], sizeof(std::vector::value_type), nz, fpin); + is_read3 = fread(&val[0], sizeof(std::vector::value_type), nz, fpin); + is_read4 = fread(&rhs[0], sizeof(std::vector::value_type), n, fpin); + + if (!is_read1 || !is_read2 || !is_read3 || !is_read4) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + +//reader is for ints and double +int read_data_amgx_csr_bin(FILE* fpin, + int n, + int nz, + std::vector & row_ptr, + std::vector & col_ind, + std::vector& val + ) + { + size_t is_read1, is_read2, is_read3; + is_read1 = fread(&row_ptr[0], sizeof(std::vector::value_type), n + 1, fpin); + is_read2 = fread(&col_ind[0], sizeof(std::vector::value_type), nz, fpin); + + double* t_storage = new double[std::max(n, nz)]; + is_read3 = fread(t_storage, sizeof(double), nz, fpin); + for (int i = 0; i < nz; i++) + { + val[i] = static_cast(t_storage[i]); + } + delete[] t_storage; + + if (!is_read1 || !is_read2 || !is_read3) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + +int read_data_amgx_csr_bin_rhs(FILE* fpin, + int n, + int nz, + std::vector & row_ptr, + std::vector & col_ind, + std::vector& val, + std::vector& rhs + ) + { + size_t is_read1, is_read2, is_read3, is_read4; + is_read1 = fread(&row_ptr[0], sizeof(std::vector::value_type), n + 1, fpin); + is_read2 = fread(&col_ind[0], sizeof(std::vector::value_type), nz, fpin); + double* t_storage = new double[std::max(n, nz)]; + is_read3 = fread(t_storage, sizeof(double), nz, fpin); + for (int i = 0; i < nz; i++) + { + val[i] = static_cast(t_storage[i]); + } + is_read4 = fread(t_storage, sizeof(double), n, fpin); + for (int i = 0; i < n; i++) + { + rhs[i] = static_cast(t_storage[i]); + } + delete[] t_storage; + + if (!is_read1 || !is_read2 || !is_read3 || !is_read4) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + +//read binary vector from file +int read_binary_vector(FILE* fpin, + int n, + std::vector& val + ) + { + size_t is_read1; + + double* t_storage = new double[n]; + is_read1 = fread(t_storage, sizeof(double), n, fpin); + for (int i = 0; i < n; i++) + { + if (t_storage[i] == DBL_MAX) + val[i] = FLT_MAX; + else if (t_storage[i] == -DBL_MAX) + val[i] = -FLT_MAX; + else + val[i] = static_cast(t_storage[i]); + } + delete[] t_storage; + + if (is_read1 != (size_t) n) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + +int read_binary_vector(FILE* fpin, + int n, + std::vector& val + ) + { + size_t is_read1; + + is_read1 = fread(&val[0], sizeof(double), n, fpin); + + if (is_read1 != (size_t) n) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + +int read_binary_vector(FILE* fpin, + int n, + std::vector& val + ) + { + size_t 
is_read1; + + is_read1 = fread(&val[0], sizeof(int), n, fpin); + + if (is_read1 != (size_t) n) + { + printf("%s", "I/O fail\n"); + return 1; + } + return 0; +} + +//read in as one based +template +void init_MatrixMarket(IndexType_ base, + const char *filename, + bool edges_only, //assumes value is 1 + bool transpose, //parameter to run on A or A' + IndexType_ &n, + IndexType_ &m, + IndexType_ &nnz, + std::vector &csrVal, + std::vector &csrColInd, + std::vector &csrRowInd) + { + FILE *inputFile = fopen(filename, "r"); + if (inputFile == NULL) + { + std::cerr << "ERROR: File path not valid!" << std::endl; + exit(EXIT_FAILURE); + } + std::vector > matrix; + readMatrixMarketFile(filename, m, n, nnz, + matrix, + edges_only); + + Mat compare(transpose); + std::sort(matrix.begin(), matrix.end(), compare); + csrVal.resize(nnz); + csrColInd.resize(nnz); + csrRowInd.resize(nnz); + for (int k = 0; k < nnz; ++k) + { + csrVal[k] = matrix[k].val; + csrColInd[k] = (transpose) ? matrix[k].i : matrix[k].j; //doing the transpose + csrRowInd[k] = (transpose) ? matrix[k].j : matrix[k].i; + } + if (base == 0) //always give base 0 + { + for (int i = 0; i < nnz; ++i) + { + csrColInd[i] -= 1; //get zero based + csrRowInd[i] -= 1; + } + } + fclose(inputFile); +} +/*template + bool almost_equal (std::vector & a, std::vector & b, val_t epsilon) + { + if (a.size() != b.size()) return false; + bool passed = true; + std::vector::iterator itb=b.begin(); + for (std::vector::iterator ita = a.begin() ; ita != a.end(); ++ita) + { + if (fabs(*ita - *itb) > epsilon) + { + printf("At ( %ld ) : x1=%lf | x2=%lf\n",ita-a.begin(), *ita,*itb); + passed = false; + } + ++itb; + } + return passed; + }*/ + diff --git a/cpp/nvgraph/cpp/thirdparty/cnmem b/cpp/nvgraph/cpp/thirdparty/cnmem new file mode 160000 index 00000000000..37896cc9bfc --- /dev/null +++ b/cpp/nvgraph/cpp/thirdparty/cnmem @@ -0,0 +1 @@ +Subproject commit 37896cc9bfc6536a8c878a1e675835c22d827821 diff --git a/cpp/nvgraph/cpp/thirdparty/cub b/cpp/nvgraph/cpp/thirdparty/cub new file mode 160000 index 00000000000..c3cceac115c --- /dev/null +++ b/cpp/nvgraph/cpp/thirdparty/cub @@ -0,0 +1 @@ +Subproject commit c3cceac115c072fb63df1836ff46d8c60d9eb304 diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_histogram.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_histogram.cuh new file mode 100644 index 00000000000..3b6cc4c92bc --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_histogram.cuh @@ -0,0 +1,787 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . + */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_load.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * + */ +enum BlockHistogramMemoryPreference +{ + GMEM, + SMEM, + BLEND +}; + + +/** + * Parameterizable tuning policy type for AgentHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue +struct AgentHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . 
+ */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. + int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading samples + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT, ///< Signed integer type for global offsets + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentHistogram +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + /// The pixel type of SampleT + typedef typename CubVector::Type PixelT; + + /// The quad type of SampleT + typedef typename CubVector::Type QuadT; + + /// Constants + enum + { + BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, + + PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, + SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, + QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, + + TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, + TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, + + IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, + + MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? 
+ AgentHistogramPolicyT::MEM_PREFERENCE : + GMEM, + + IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, + }; + + /// Cache load modifier for reading input elements + static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; + + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + SampleIteratorT>::Type // Directly use the supplied input iterator type + WrappedSampleIteratorT; + + /// Pixel input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedPixelIteratorT; + + /// Qaud input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedQuadIteratorT; + + /// Parameterized BlockLoad type for samples + typedef BlockLoad< + SampleT, + BLOCK_THREADS, + SAMPLES_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadSampleT; + + /// Parameterized BlockLoad type for pixels + typedef BlockLoad< + PixelT, + BLOCK_THREADS, + PIXELS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadPixelT; + + /// Parameterized BlockLoad type for quads + typedef BlockLoad< + QuadT, + BLOCK_THREADS, + QUADS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadQuadT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) + + int tile_idx; + + // Aliasable storage layout + union Aliasable + { + typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples + typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels + typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads + + } aliasable; + }; + + + /// Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Sample input iterator (with cache modifier applied, if possible) + WrappedSampleIteratorT d_wrapped_samples; + + /// Native pointer for input samples (possibly NULL if unavailable) + SampleT* d_native_samples; + + /// The number of output bins for each channel + int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; + + /// The number of privatized bins for each channel + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; + + /// Reference to gmem privatized histograms for each channel + CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; + + /// Reference to final output histograms (gmem) + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining privatized counter indices from samples, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// Whether to prefer privatized smem counters vs privatized global counters + bool prefer_smem; + + + //--------------------------------------------------------------------- + // Initialize privatized bin counters + 
//--------------------------------------------------------------------- + + // Initialize privatized bin counters + __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) + { + privatized_histograms[CHANNEL][privatized_bin] = 0; + } + } + + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + } + + + // Initialize privatized bin counters. Specialized for privatized shared-memory counters + __device__ __forceinline__ void InitSmemBinCounters() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + InitBinCounters(privatized_histograms); + } + + + // Initialize privatized bin counters. Specialized for privatized global-memory counters + __device__ __forceinline__ void InitGmemBinCounters() + { + InitBinCounters(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Update final output histograms + //--------------------------------------------------------------------- + + // Update final output histograms from privatized histograms + __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + + // Apply privatized bin counts to output bin counts + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_bins = num_privatized_bins[CHANNEL]; + for (int privatized_bin = threadIdx.x; + privatized_bin < channel_bins; + privatized_bin += BLOCK_THREADS) + { + int output_bin = -1; + CounterT count = privatized_histograms[CHANNEL][privatized_bin]; + bool is_valid = count > 0; + + output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); + + if (output_bin >= 0) + { + atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); + } + + } + } + } + + + // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters + __device__ __forceinline__ void StoreSmemOutput() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + StoreOutput(privatized_histograms); + } + + + // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters + __device__ __forceinline__ void StoreGmemOutput() + { + StoreOutput(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Tile accumulation + //--------------------------------------------------------------------- + + // Accumulate pixels. Specialized for RLE compression. 
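// Editorial note, not part of the original file: in the RLE-compressed
// specialization below, each thread decodes a bin for every pixel it owns
// (per active channel), then walks its pixels in order and collapses runs of
// identical bins into a single atomicAdd of the run length; pixels whose bin
// decodes to a negative value are skipped. This is what the _RLE_COMPRESS
// policy knob buys: fewer conflicting atomics on the privatized histograms
// when neighboring pixels fall into the same bin.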
+ __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + // Bin pixels + int bins[PIXELS_PER_THREAD]; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + bins[PIXEL] = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); + } + + CounterT accumulator = 1; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) + { + if (bins[PIXEL] != bins[PIXEL + 1]) + { + if (bins[PIXEL] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); + + accumulator = 0; + } + accumulator++; + } + + // Last pixel + if (bins[PIXELS_PER_THREAD - 1] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); + } + } + + + // Accumulate pixels. Specialized for individual accumulation of each pixel. + __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int bin = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); + if (bin >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bin, 1); + } + } + } + + + /** + * Accumulate pixel, specialized for smem privatized histogram + */ + __device__ __forceinline__ void AccumulateSmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); + } + + + /** + * Accumulate pixel, specialized for gmem privatized histogram + */ + __device__ __forceinline__ void AccumulateGmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); + } + + + + //--------------------------------------------------------------------- + // Tile loading + //--------------------------------------------------------------------- + + // Load full, aligned tile using pixel iterator (multi-channel) + template + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples)); + } + + // Load full, aligned tile using quad iterator (single-channel) + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<1> num_active_channels) + { + typedef QuadT 
AliasedQuads[QUADS_PER_THREAD]; + + WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); + + // Load using a wrapped quad iterator + BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( + d_wrapped_quads, + reinterpret_cast(samples)); + } + + // Load full, aligned tile + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); + } + + // Load full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + // Load using sample iterator + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples)); + } + + // Load partially-full, aligned tile using the pixel iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + int valid_pixels = valid_samples / NUM_CHANNELS; + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples), + valid_pixels); + } + + // Load partially-full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples), + valid_samples); + } + + + //--------------------------------------------------------------------- + // Tile processing + //--------------------------------------------------------------------- + + // Consume a tile of data samples + template < + bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) + bool IS_FULL_TILE> // Whether the tile is full + __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) + { + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; + bool is_valid[PIXELS_PER_THREAD]; + + // Load tile + LoadTile( + block_offset, + valid_samples, + samples, + Int2Type(), + Int2Type()); + + // Set valid flags + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); + + // Accumulate samples +#if CUB_PTX_ARCH >= 120 + if (prefer_smem) + AccumulateSmemPixels(samples, is_valid); + else + AccumulateGmemPixels(samples, is_valid); +#else + AccumulateGmemPixels(samples, is_valid); +#endif + + } + + + // Consume row tiles. 
Specialized for work-stealing from queue + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + + int num_tiles = num_rows * tiles_per_row; + int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; + OffsetT num_even_share_tiles = gridDim.x * gridDim.y; + + while (tile_idx < num_tiles) + { + int row = tile_idx / tiles_per_row; + int col = tile_idx - (row * tiles_per_row); + OffsetT row_offset = row * row_stride_samples; + OffsetT col_offset = (col * TILE_SAMPLES); + OffsetT tile_offset = row_offset + col_offset; + + if (col == tiles_per_row - 1) + { + // Consume a partially-full tile at the end of the row + OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; + ConsumeTile(tile_offset, num_remaining); + } + else + { + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + } + + CTA_SYNC(); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; + + CTA_SYNC(); + + tile_idx = temp_storage.tile_idx; + } + } + + + // Consume row tiles. Specialized for even-share (striped across thread blocks) + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + for (int row = blockIdx.y; row < num_rows; row += gridDim.y) + { + OffsetT row_begin = row * row_stride_samples; + OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); + OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); + + while (tile_offset < row_end) + { + OffsetT num_remaining = row_end - tile_offset; + + if (num_remaining < TILE_SAMPLES) + { + // Consume partial tile + ConsumeTile(tile_offset, num_remaining); + break; + } + + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + tile_offset += gridDim.x * TILE_SAMPLES; + } + } + } + + + //--------------------------------------------------------------------- + // Parameter extraction + //--------------------------------------------------------------------- + + // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) + template < + CacheLoadModifier _MODIFIER, + typename _ValueT, + typename _OffsetT> + __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) + { + return itr.ptr; + } + + // Return a native pixel pointer (specialized for other types) + template + __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) + { + return NULL; + } + + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + /** + * Constructor + */ + __device__ __forceinline__ AgentHistogram( + TempStorage &temp_storage, ///< Reference to temp_storage + SampleIteratorT d_samples, ///< Input data to reduce + 
int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms + CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel + : + temp_storage(temp_storage.Alias()), + d_wrapped_samples(d_samples), + num_output_bins(num_output_bins), + num_privatized_bins(num_privatized_bins), + d_output_histograms(d_output_histograms), + privatized_decode_op(privatized_decode_op), + output_decode_op(output_decode_op), + d_native_samples(NativePointer(d_wrapped_samples)), + prefer_smem((MEM_PREFERENCE == SMEM) ? + true : // prefer smem privatized histograms + (MEM_PREFERENCE == GMEM) ? + false : // prefer gmem privatized histograms + blockIdx.x & 1) // prefer blended privatized histograms + { + int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; + + // Initialize the locations of this block's privatized histograms + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); + } + + + /** + * Consume image + */ + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) + int quad_mask = AlignBytes::ALIGN_BYTES - 1; + int pixel_mask = AlignBytes::ALIGN_BYTES - 1; + size_t row_bytes = sizeof(SampleT) * row_stride_samples; + + bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel + ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned + ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad + + bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel + ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned + ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel + + // Whether rows are aligned and can be vectorized + if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + else + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + } + + + /** + * Initialize privatized bin counters. 
Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void InitBinCounters() + { + if (prefer_smem) + InitSmemBinCounters(); + else + InitGmemBinCounters(); + } + + + /** + * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void StoreOutput() + { + if (prefer_smem) + StoreSmemOutput(); + else + StoreGmemOutput(); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_radix_sort_downsweep.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_radix_sort_downsweep.cuh new file mode 100644 index 00000000000..0eee5f4ebf1 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_radix_sort_downsweep.cuh @@ -0,0 +1,772 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ + + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Radix ranking algorithm + */ +enum RadixRankAlgorithm +{ + RADIX_RANK_BASIC, + RADIX_RANK_MEMOIZE, + RADIX_RANK_MATCH +}; + +/** + * Parameterizable tuning policy type for AgentRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + + + + +/** + * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ +template < + typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< KeyT type + typename ValueT, ///< ValueT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of KeyT + typedef typename Traits::UnsignedBits UnsignedBits; + + static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; + static const UnsignedBits MAX_KEY = Traits::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; + static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; + + enum + { + BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + typedef CacheModifiedInputIterator ValuesItr; + + // Radix ranking type to use + typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC), + BlockRadixRank, + typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + BlockRadixRank, + BlockRadixRankMatch + >::Type + >::Type BlockRadixRankT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD + }; + + // BlockLoad type (keys) + typedef BlockLoad< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadKeysT; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadValuesT; + + // Value exchange array type + typedef ValueT ValueExchangeT[TILE_ITEMS]; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + typename BlockLoadKeysT::TempStorage load_keys; + typename BlockLoadValuesT::TempStorage load_values; + typename BlockRadixRankT::TempStorage radix_rank; + + struct + { + UnsignedBits exchange_keys[TILE_ITEMS]; + OffsetT relative_bin_offsets[RADIX_DIGITS]; + }; + + Uninitialized exchange_values; + + OffsetT exclusive_digit_prefix[RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + ValueT *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // 
Number of bits in current digit + int num_bits; + + // Whether to short-cirucit + int short_circuit; + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + + /** + * Scatter ranked keys through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; + UnsignedBits digit = BFE(key, current_bit, num_bits); + relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; + + // Un-twiddle + key = Traits::TwiddleOut(key); + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; + } + } + } + + + /** + * Scatter ranked values through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + CTA_SYNC(); + + ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + exchange_values[ranks[ITEM]] = values[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; + } + } + } + + /** + * Load a tile of keys (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys, valid_items, oob_item); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); + } + + + /** + * Load a tile of keys (specialized for partial tile, match ranking algorithm) + */ + 
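+    // Note on the key handling above (illustrative): ScatterKeys() writes
+    // each twiddled key into shared memory at its rank, re-reads the keys in
+    // block-striped order, and scatters each one to
+    // d_keys_out[relative_bin_offsets[digit] + tile_rank], where tile_rank
+    // is the key's position within the rank-ordered tile.  The digit is
+    // extracted with BFE(key, current_bit, num_bits), which behaves like
+    //
+    //   UnsignedBits digit = (key >> current_bit) & ((UnsignedBits(1) << num_bits) - 1);
+    //
+    // and Traits::TwiddleIn / TwiddleOut convert keys to and from an
+    // unsigned-bit representation whose ascending order matches the original
+    // key order (for example, by flipping the sign bit of signed integers),
+    // so the same bit extraction works for signed and floating-point keys.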
__device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); + } + + + /** + * Load a tile of values (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values); + + CTA_SYNC(); + } + + + /** + * Load a tile of values (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values, valid_items); + + CTA_SYNC(); + } + + + /** + * Load a tile of items (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + volatile OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); + } + + + /** + * Load a tile of items (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + volatile OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); + } + + + /** + * Truck along associated values + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type /*is_keys_only*/) + { + CTA_SYNC(); + + ValueT values[ITEMS_PER_THREAD]; + + LoadValues( + values, + block_offset, + valid_items, + Int2Type(), + Int2Type()); + + ScatterValues( + values, + relative_bin_offsets, + ranks, + valid_items); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + OffsetT /*block_offset*/, + OffsetT /*valid_items*/, + Int2Type /*is_keys_only*/) + {} + + + /** + * Process tile + */ + template + __device__ __forceinline__ void ProcessTile( + OffsetT block_offset, + const OffsetT &valid_items = TILE_ITEMS) + { + UnsignedBits keys[ITEMS_PER_THREAD]; + int ranks[ITEMS_PER_THREAD]; + OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; + + // Assign default (min/max) value to all keys + UnsignedBits default_key = (IS_DESCENDING) ? 
LOWEST_KEY : MAX_KEY; + + // Load tile of keys + LoadKeys( + keys, + block_offset, + valid_items, + default_key, + Int2Type(), + Int2Type()); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + BlockRadixRankT(temp_storage.radix_rank).RankKeys( + keys, + ranks, + current_bit, + num_bits, + exclusive_digit_prefix); + + CTA_SYNC(); + + // Share exclusive digit prefix + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Store exclusive prefix + temp_storage.exclusive_digit_prefix[bin_idx] = + exclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Get inclusive digit prefix + int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + { + // Get inclusive digit prefix from exclusive prefix (higher bins come first) + inclusive_digit_prefix[track] = (bin_idx == 0) ? + (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx - 1]; + } + else + { + // Get inclusive digit prefix from exclusive prefix (lower bins come first) + inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? + (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx + 1]; + } + } + } + + CTA_SYNC(); + + // Update global scatter base offsets for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_offset[track] -= exclusive_digit_prefix[track]; + temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; + bin_offset[track] += inclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Scatter keys + ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); + + // Gather/scatter values + GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); + } + + //--------------------------------------------------------------------- + // Copy shortcut + //--------------------------------------------------------------------- + + /** + * Copy tiles within the range of input + */ + template < + typename InputIteratorT, + typename T> + __device__ __forceinline__ void Copy( + InputIteratorT d_in, + T *d_out, + OffsetT block_offset, + OffsetT block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + OffsetT valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + 
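+    // Worked example for the offset bookkeeping in ProcessTile() above
+    // (ascending sort; the numbers are illustrative): suppose this block's
+    // running output offset for digit d is bin_offset[d] = 100, and in the
+    // current tile 7 keys rank before the first digit-d key (exclusive
+    // prefix) while 12 rank before the first key of the next digit
+    // (inclusive prefix), i.e. the tile holds 12 - 7 = 5 keys of digit d.
+    // Then
+    //
+    //   relative_bin_offsets[d] = 100 - 7 = 93,
+    //
+    // so a digit-d key whose tile-wide rank is r (r = 7..11) is scattered to
+    // d_keys_out[93 + r], i.e. positions 100..104, and afterwards
+    //
+    //   bin_offset[d] = 93 + 12 = 105 = 100 + 5,
+    //
+    // which is exactly where the next tile's digit-d keys must begin.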
template + __device__ __forceinline__ void Copy( + InputIteratorT /*d_in*/, + NullType * /*d_out*/, + OffsetT /*block_offset*/, + OffsetT /*block_end*/) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], + OffsetT num_items, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + this->bin_offset[track] = bin_offset[track]; + + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Short circuit if the histogram has only bin counts of only zeros or problem-size + short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT num_items, + OffsetT *d_spine, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + + // Load my block's bin offset for my bin + bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Distribute keys from a segment of input tiles. 
+ */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + OffsetT block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile(block_offset); + block_offset += TILE_ITEMS; + + CTA_SYNC(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile(block_offset, block_end - block_offset); + } + + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_radix_sort_upsweep.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_radix_sort_upsweep.cuh new file mode 100644 index 00000000000..803fadf2486 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_radix_sort_upsweep.cuh @@ -0,0 +1,526 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
+ */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../warp/warp_reduce.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . + */ +template < + typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type + typename KeyT, ///< KeyT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. 
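+        // Illustrative arithmetic for the packed counters above:
+        // DigitCounter is an unsigned char, so PACKING_RATIO =
+        // sizeof(unsigned int) / sizeof(unsigned char) = 4 counters per
+        // 32-bit shared-memory word, and each counter can hold at most 255
+        // before overflowing.  With a hypothetical tuning of
+        // KEYS_PER_THREAD = 4, a thread adds at most 4 increments per tile
+        // to any single counter, so UNROLL_COUNT (defined just below) is
+        //
+        //   CUB_MIN(64, 255 / 4) = 63,
+        //
+        // i.e. up to 63 full tiles can be bucketed back-to-back before
+        // UnpackDigitCounts() must drain the 8-bit counters into each
+        // thread's OffsetT registers (local_counts).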
+ + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; + OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + AgentRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + Iterate::BucketKeys(cta, keys); + } + }; + + // Terminate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this warp + */ + __device__ __forceinline__ void 
UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting + CTA_SYNC(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + OffsetT block_offset, + const OffsetT &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortUpsweep( + TempStorage &temp_storage, + const KeyT *d_keys_in, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. 
+ */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + const OffsetT &block_end) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + CTA_SYNC(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + CTA_SYNC(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + CTA_SYNC(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + } + + + /** + * Extract counts (saving them to the external array) + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT *counters, + int bin_stride = 1, + int bin_offset = 0) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + + // Whole blocks + #pragma unroll + for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; + (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; + BIN_BASE += BLOCK_THREADS) + { + int bin_idx = BIN_BASE + threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + + // Remainder + if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) + { + int bin_idx = threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + } + + + /** + * Extract counts + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_count[track] = 0; + + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count[track] += temp_storage.block_counters[i][bin_idx]; + } + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_reduce.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_reduce.cuh new file mode 100644 index 00000000000..5528d8bdd64 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_reduce.cuh @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . 
+ */ + +#pragma once + +#include + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements +struct AgentReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. + */ +template < + typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type + typename InputIteratorT, ///< Random-access iterator type for input + typename OutputIteratorT, ///< Random-access iterator type for output + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +struct AgentReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type InputT; + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + /// Vector type of InputT for data movement + typedef typename CubVector::Type VectorT; + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + /// Constants + enum + { + BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && + (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && + (IsPointer::VALUE) && Traits::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; + + /// Parameterized BlockReduce primitive + typedef BlockReduce BlockReduceT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + typename BlockReduceT::TempStorage reduce; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIteratorT d_in; ///< Input data to reduce + WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + + + //--------------------------------------------------------------------- + // Utility + //--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type /*can_vectorize*/) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator /*d_in*/, + Int2Type /*can_vectorize*/) + { + return false; + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data to reduce + ReductionOp reduction_op) ///< Binary reduction operator + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_wrapped_in(d_in), + reduction_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Tile consumption + //--------------------------------------------------------------------- + + /** + * Consume a full tile of input (non-vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type 
/*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + OutputT items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a full tile of input (vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Alias items as an array of VectorT and load it in striped fashion + enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; + + // Fabricate a vectorized input iterator + InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); + CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); + + // Load items as vector items + InputT input_items[ITEMS_PER_THREAD]; + VectorT *vec_items = reinterpret_cast(input_items); + #pragma unroll + for (int i = 0; i < WORDS; ++i) + vec_items[i] = d_vec_in[BLOCK_THREADS * i]; + + // Convert from input type to output type + OutputT items[ITEMS_PER_THREAD]; + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + items[i] = input_items[i]; + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a partial tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int valid_items, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Partial tile + int thread_offset = threadIdx.x; + + // Read first item + if ((IS_FIRST_TILE) && (thread_offset < valid_items)) + { + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; + } + + // Continue reading items (block-striped) + while (thread_offset < valid_items) + { + OutputT item = d_wrapped_in[block_offset + thread_offset]; + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * \brief Reduce a contiguous segment of input tiles + */ + template + __device__ __forceinline__ OutputT ConsumeRange( + GridEvenShare &even_share, ///< GridEvenShare descriptor + Int2Type can_vectorize) ///< Whether or not we can vectorize loads + { + OutputT thread_aggregate; + + if (even_share.block_offset + TILE_ITEMS > even_share.block_end) + { + // First tile isn't full (not all threads have valid items) + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + return 
BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); + } + + // At least one full block + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + + // Consume subsequent full tiles of input + while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) + { + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + } + + // Consume a partially-full tile + if (even_share.block_offset < even_share.block_end) + { + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + } + + // Compute block-wide reduction (all threads have valid items) + return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + } + + + /** + * \brief Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeRange( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + GridEvenShare even_share; + even_share.template BlockInit(block_offset, block_end); + + return (IsAligned(d_in + block_offset, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + } + + + /** + * Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeTiles( + GridEvenShare &even_share) ///< [in] GridEvenShare descriptor + { + // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block + even_share.template BlockInit(); + + return (IsAligned(d_in, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_reduce_by_key.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_reduce_by_key.cuh new file mode 100644 index 00000000000..a57d60ea210 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_reduce_by_key.cuh @@ -0,0 +1,549 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentReduceByKey +{ + //--------------------------------------------------------------------- 
+ // Types and constants + //--------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair OffsetValuePairT; + + // Tuple type for pairing keys and values + typedef KeyValuePair KeyValuePairT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Guarded inequality functor + template + struct GuardedInequalityWrapper + { + _EqualityOpT op; ///< Wrapped equality operator + int num_remaining; ///< Items remaining + + /// Constructor + __host__ __device__ __forceinline__ + GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const + { + if (idx < num_remaining) + return !op(a, b); // In bounds + + // Return true if first out-of-bounds item, false otherwise + return (idx == num_remaining); + } + }; + + + // Constants + enum + { + BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + KeysInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedKeysInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + ValuesInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedValuesInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for keys + typedef BlockLoad< + 
KeyOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadKeysT; + + // Parameterized BlockLoad type for values + typedef BlockLoad< + ValueOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadValuesT; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity< + KeyOutputT, + BLOCK_THREADS> + BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetValuePairT, + BLOCK_THREADS, + AgentReduceByKeyPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Key and value exchange types + typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1]; + typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading keys + typename BlockLoadKeysT::TempStorage load_keys; + + // Smem needed for loading values + typename BlockLoadValuesT::TempStorage load_values; + + // Smem needed for compacting key value pairs(allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedKeysInputIteratorT d_keys_in; ///< Input keys + UniqueOutputIteratorT d_unique_out; ///< Unique output keys + WrappedValuesInputIteratorT d_values_in; ///< Input values + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified + EqualityOpT equality_op; ///< KeyT equality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentReduceByKey( + TempStorage& temp_storage, ///< Reference to temp_storage + KeysInputIteratorT d_keys_in, ///< Input keys + UniqueOutputIteratorT d_unique_out, ///< Unique output keys + ValuesInputIteratorT d_values_in, ///< Input values + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_unique_out(d_unique_out), + d_values_in(d_values_in), + d_aggregates_out(d_aggregates_out), + d_num_runs_out(d_num_runs_out), + equality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + 
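+
+    //---------------------------------------------------------------------
+    // Editorial sketch (not part of the original CUB source)
+    //---------------------------------------------------------------------
+
+    // The tile scan below operates on (segment-count, value) pairs.  The member
+    // sketched here is a simplified model of the combine step performed by
+    // ReduceBySegmentOpT: segment counts always add, while a value is folded
+    // into its predecessor only when the right-hand pair does not begin a new
+    // segment.  It is illustrative only and is never called by the agent.
+    static __device__ __forceinline__ OffsetValuePairT CombineBySegmentSketch(
+        ReductionOpT        reduction_op,       ///< Value reduction operator
+        OffsetValuePairT    first,              ///< Earlier (left-hand) partial
+        OffsetValuePairT    second)             ///< Later (right-hand) partial
+    {
+        OffsetValuePairT retval;
+        retval.key = first.key + second.key;            // Running count of segment heads
+        retval.value = (second.key) ?
+            second.value :                              // New segment: restart the running value
+            reduction_op(first.value, second.value);    // Same segment: accumulate
+        return retval;
+    }
+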
//--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Directly scatter flagged items to output offsets + */ + __device__ __forceinline__ void ScatterDirect( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD]) + { + // Scatter flagged keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; + d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; + } + } + } + + + /** + * 2-phase scatter flagged items to output offsets + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate: the scatter offsets must be decremented for value aggregates + */ + __device__ __forceinline__ void ScatterTwoPhase( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + CTA_SYNC(); + + // Compact and scatter pairs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) + { + KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; + d_unique_out[num_tile_segments_prefix + item] = pair.key; + d_aggregates_out[num_tile_segments_prefix + item] = pair.value; + } + } + + + /** + * Scatter flagged items + */ + __device__ __forceinline__ void Scatter( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) + { + ScatterTwoPhase( + scatter_items, + segment_flags, + segment_indices, + num_tile_segments, + num_tile_segments_prefix); + } + else + { + ScatterDirect( + scatter_items, + segment_flags, + segment_indices); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys + KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up + ValueOutputT values[ITEMS_PER_THREAD]; // Tile values + OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags + OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices + OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + KeyValuePairT 
scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering + + // Load keys + if (IS_LAST_TILE) + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); + else + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + + // Load tile predecessor key in first thread + KeyOutputT tile_predecessor; + if (threadIdx.x == 0) + { + tile_predecessor = (tile_idx == 0) ? + keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head) + d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile + } + + CTA_SYNC(); + + // Load values + if (IS_LAST_TILE) + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); + else + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); + + CTA_SYNC(); + + // Initialize head-flags and shuffle up the previous keys + if (IS_LAST_TILE) + { + // Use custom flag operator to additionally flag the first out-of-bounds item + GuardedInequalityWrapper flag_op(equality_op, num_remaining); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + else + { + InequalityWrapper flag_op(equality_op); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + + // Zip values and head flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scan_items[ITEM].value = values[ITEM]; + scan_items[ITEM].key = head_flags[ITEM]; + } + + // Perform exclusive tile scan + OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate + OffsetT num_segments_prefix; // Number of segments prior to this tile + ValueOutputT total_aggregate; // The tile prefix folded with block_aggregate + if (tile_idx == 0) + { + // Scan first tile + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); + num_segments_prefix = 0; + total_aggregate = block_aggregate.value; + + // Update tile status if there are successor tiles + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); + + block_aggregate = prefix_op.GetBlockAggregate(); + num_segments_prefix = prefix_op.GetExclusivePrefix().key; + total_aggregate = reduction_op( + prefix_op.GetExclusivePrefix().value, + block_aggregate.value); + } + + // Rezip scatter items and segment indices + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scatter_items[ITEM].key = prev_keys[ITEM]; + scatter_items[ITEM].value = scan_items[ITEM].value; + segment_indices[ITEM] = scan_items[ITEM].key; + } + + // At this point, each flagged segment head has: + // - The key for the previous segment + // - The reduced value from the previous segment + // - The segment index for the reduced value + + // Scatter flagged keys and values + OffsetT num_tile_segments = block_aggregate.key; + Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); + + // Last thread in last tile will output final count (and last pair, if necessary) + if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) + { + OffsetT num_segments = num_segments_prefix + num_tile_segments; + + // If the 
last tile is a whole tile, output the final_value + if (num_remaining == TILE_ITEMS) + { + d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; + d_aggregates_out[num_segments] = total_aggregate; + num_segments++; + } + + // Output the total number of items selected + *d_num_runs_out = num_segments; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_rle.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_rle.cuh new file mode 100644 index 00000000000..0ba9216176c --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_rle.cuh @@ -0,0 +1,837 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
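+ *
+ * \par
+ * Editorial sketch (not part of the original CUB source): the host-side reference
+ * below models the output this agent contributes to -- the offset and length of
+ * every "non-trivial" run (a run longer than one item), plus the total number of
+ * such runs.  The function name and the use of std::vector are illustrative only.
+ *
+ * \code
+ * #include <cstddef>
+ * #include <vector>
+ *
+ * // For input {1, 1, 2, 3, 3, 3}: offsets = {0, 3}, lengths = {2, 3}, returns 2
+ * template <typename T>
+ * std::size_t NonTrivialRunsReference(
+ *     const std::vector<T>     &in,
+ *     std::vector<std::size_t> &offsets,
+ *     std::vector<std::size_t> &lengths)
+ * {
+ *     for (std::size_t i = 0; i < in.size(); )
+ *     {
+ *         std::size_t j = i + 1;
+ *         while ((j < in.size()) && (in[j] == in[i]))
+ *             ++j;                            // Extend the current run
+ *         if (j - i > 1)                      // Keep only non-trivial runs
+ *         {
+ *             offsets.push_back(i);
+ *             lengths.push_back(j - i);
+ *         }
+ *         i = j;                              // Advance to the next run
+ *     }
+ *     return offsets.size();
+ * }
+ * \endcode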
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRle + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentRlePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for data + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values + typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRle +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type T; + + /// The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + /// Tuple type for scanning (pairs run-length and run-index) + typedef KeyValuePair LengthOffsetPair; + + /// Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, + WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// Whether or not to sync after loading data + SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, + ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, + }; + + + /** + * Special operator that signals all out-of-bounds items are not equal to everything else, + * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked + * trivial. + */ + template + struct OobInequalityOp + { + OffsetT num_remaining; + EqualityOpT equality_op; + + __device__ __forceinline__ OobInequalityOp( + OffsetT num_remaining, + EqualityOpT equality_op) + : + num_remaining(num_remaining), + equality_op(equality_op) + {} + + template + __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) + { + if (!LAST_TILE || (idx < num_remaining)) + return !equality_op(first, second); + else + return true; + } + }; + + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for data + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedVLengthnputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Parameterized BlockLoad type for data + typedef BlockLoad< + T, + AgentRlePolicyT::BLOCK_THREADS, + AgentRlePolicyT::ITEMS_PER_THREAD, + AgentRlePolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockDiscontinuity type for data + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Parameterized WarpScan type + typedef WarpScan WarpScanPairs; + + // Reduce-length-by-run scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + LengthOffsetPair, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Warp exchange types + typedef WarpExchange WarpExchangePairs; + + typedef typename If::Type WarpExchangePairsStorage; + + typedef WarpExchange WarpExchangeOffsets; + typedef WarpExchange WarpExchangeLengths; + + typedef LengthOffsetPair WarpAggregates[WARPS]; + + // Shared memory type for this thread block + struct _TempStorage + { + // Aliasable storage layout + union Aliasable + { + struct + { + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans + Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage 
load; + + // Aliasable layout needed for two-phase scatter + union ScatterAliasable + { + unsigned long long align; + WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; + + } scatter_aliasable; + + } aliasable; + + OffsetT tile_idx; // Shared tile index + LengthOffsetPair tile_inclusive; // Inclusive tile prefix + LengthOffsetPair tile_exclusive; // Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + + WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets + LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + + EqualityOpT equality_op; ///< T equality operator + ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentRle( + TempStorage &temp_storage, ///< [in] Reference to temp_storage + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths + EqualityOpT equality_op, ///< [in] T equality operator + OffsetT num_items) ///< [in] Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_offsets_out(d_offsets_out), + d_lengths_out(d_lengths_out), + equality_op(equality_op), + scan_op(cub::Sum()), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_remaining, + T (&items)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + bool head_flags[ITEMS_PER_THREAD]; + bool tail_flags[ITEMS_PER_THREAD]; + + OobInequalityOp inequality_op(num_remaining, equality_op); + + if (FIRST_TILE && LAST_TILE) + { + // First-and-last-tile always head-flags the first item and tail-flags the last item + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, items, inequality_op); + } + else if (FIRST_TILE) + { + // First-tile always head-flags the first item + + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, tile_successor_item, items, inequality_op); + } + else if (LAST_TILE) + { + // Last-tile always flags the last item + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + 
tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, items, inequality_op); + } + else + { + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); + } + + // Zip counts and runs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); + lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); + } + } + + //--------------------------------------------------------------------- + // Scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan of allocations + */ + __device__ __forceinline__ void WarpScanAllocations( + LengthOffsetPair &tile_aggregate, + LengthOffsetPair &warp_aggregate, + LengthOffsetPair &warp_exclusive_in_tile, + LengthOffsetPair &thread_exclusive_in_warp, + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + // Perform warpscans + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + LengthOffsetPair identity; + identity.key = 0; + identity.value = 0; + + LengthOffsetPair thread_inclusive; + LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); + WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan( + thread_aggregate, + thread_inclusive, + thread_exclusive_in_warp, + identity, + scan_op); + + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive; + + CTA_SYNC(); + + // Accumulate total selected and the warp-wide prefix + warp_exclusive_in_tile = identity; + warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id]; + tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0]; + + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_exclusive_in_tile = tile_aggregate; + + tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Two-phase scatter, specialized for warp time-slicing + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Locally compact items within the warp (first warp) + if (warp_id == 0) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + + // Locally compact items within the warp (remaining warps) + #pragma unroll + for (int SLICE = 1; SLICE < WARPS; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + } + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Two-phase scatter + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Unzip + OffsetT run_offsets[ITEMS_PER_THREAD]; + LengthT run_lengths[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + run_offsets[ITEM] = lengths_and_offsets[ITEM].key; + run_lengths[ITEM] = lengths_and_offsets[ITEM].value; + } + + WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( + run_offsets, thread_num_runs_exclusive_in_warp); + + WARP_SYNC(0xffffffff); + + WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( + run_lengths, thread_num_runs_exclusive_in_warp); + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = run_offsets[ITEM]; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = run_lengths[ITEM]; + } + } + } + } + + + /** + * Direct scatter + */ + template + __device__ __forceinline__ void ScatterDirect( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + thread_num_runs_exclusive_in_warp[ITEM]; + + // Scatter 
offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if (item_offset >= 1) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Scatter + */ + template + __device__ __forceinline__ void Scatter( + OffsetT tile_num_runs_aggregate, + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) + { + // Direct scatter if the warp has any items + if (warp_num_runs_aggregate) + { + ScatterDirect( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + } + } + else + { + // Scatter two phase + ScatterTwoPhase( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets, + Int2Type()); + } + } + + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ LengthOffsetPair ConsumeTile( + OffsetT num_items, ///< Total number of global input items + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT &tile_status) ///< Global list of tile status + { + if (tile_idx == 0) + { + // First tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_aggregate); + + // Update thread_exclusive_in_warp to fold in warp run-length + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; + + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + + // Downsweep scan through lengths_and_num_runs + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset 
+ (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? + lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = 0; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return tile_aggregate; + } + else + { + // Not first tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // First warp computes tile prefix in lane 0 + TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx); + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + if (warp_id == 0) + { + prefix_op(tile_aggregate); + if (threadIdx.x == 0) + temp_storage.tile_exclusive = prefix_op.exclusive_prefix; + } + + CTA_SYNC(); + + LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; + + // Update thread_exclusive_in_warp to fold in warp and tile run-lengths + LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += thread_exclusive.value; + + // Downsweep scan through lengths_and_num_runs + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
+ lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return prefix_op.inclusive_prefix; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_status, ///< Global list of tile status + NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + + if (threadIdx.x == 0) + { + // Output the total number of items selected + *d_num_runs_out = running_total.key; + + // The inclusive prefix contains accumulated length reduction for the last run + if (running_total.key > 0) + d_lengths_out[running_total.key - 1] = running_total.value; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_scan.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_scan.cuh new file mode 100644 index 00000000000..567df8049e9 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_scan.cuh @@ -0,0 +1,471 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . 
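+ *
+ * \par
+ * A minimal host-side sketch of the device-wide scan this agent backs, using the
+ * standard cub::DeviceScan entry point (names outside this file are assumed from
+ * stock CUB; the semiring build follows the same two-call pattern but may expose
+ * additional template parameters):
+ * \code
+ * #include <cub/cub.cuh>
+ * // d_in / d_out: device pointers to num_items ints
+ * void   *d_temp_storage     = NULL;
+ * size_t  temp_storage_bytes = 0;
+ * // First call computes the required temporary storage size
+ * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ * // Second call runs the tiled scan, chaining tiles via decoupled look-back
+ * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ * \endcode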
+ */ +template < + typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type + typename OutputIteratorT, ///< Random-access output iterator type + typename ScanOpT, ///< Scan functor type + typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) + typename OffsetT> ///< Signed integer type for global offsets +struct AgentScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Constants + enum + { + IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided + BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OutputT, + ScanOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + OutputT, + ScanOpT> + RunningPrefixCallbackOp; + + // Shared memory type for this thread block + union _TempStorage + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + + struct + { + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input data + OutputIteratorT d_out; ///< Output data + ScanOpT scan_op; ///< Binary scan operator + InitValueT init_value; ///< The init_value 
element for ScanOpT + + + //--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + OutputT init_value, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + block_aggregate = scan_op(init_value, block_aggregate); + } + + + /** + * Inclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + InitValueT /*init_value*/, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + + /** + * Exclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + } + + + /** + * Inclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentScan( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanOpT scan_op, ///< Binary scan operator + InitValueT init_value) ///< Initial value to seed the exclusive scan + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + init_value(init_value) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // 
Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + + + //--------------------------------------------------------------------- + // Scan an sequence of consecutive tiles (independent of other thread blocks) + //--------------------------------------------------------------------- + + /** + * Process a tile of input + */ + template < + bool IS_FIRST_TILE, + bool IS_LAST_TILE> + __device__ __forceinline__ void ConsumeTile( + OffsetT tile_offset, ///< Tile offset + RunningPrefixCallbackOp& prefix_op, ///< Running prefix operator + int valid_items = TILE_ITEMS) ///< Number of valid items in the tile + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Block scan + if (IS_FIRST_TILE) + { + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + prefix_op.running_total = block_aggregate; + } + else + { + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan a consecutive share of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT range_end) ///< [in] Threadblock end offset (exclusive) + { + BlockScanRunningPrefixOp prefix_op(scan_op); + + if (range_offset + TILE_ITEMS <= range_end) + { + // Consume first tile of input (full) + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + + // Consume subsequent full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + else + { + // Consume the first tile of input (partially-full) + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + + + /** + * Scan a consecutive share of input tiles, seeded with the specified prefix value + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT 
range_end, ///< [in] Threadblock end offset (exclusive) + OutputT prefix) ///< [in] The prefix to apply to the scan segment + { + BlockScanRunningPrefixOp prefix_op(prefix, scan_op); + + // Consume full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_segment_fixup.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_segment_fixup.cuh new file mode 100644 index 00000000000..cb6e5772580 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_segment_fixup.cuh @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. 
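+ *
+ * In the SpMV path this "fixup" pass merges the partial per-row aggregates that
+ * different thread blocks produced for rows straddling tile boundaries. For
+ * semirings whose additive operator has a hardware atomic (see USE_ATOMIC_FIXUP
+ * below), the per-thread work reduces to a run-length loop of the following shape
+ * (a conceptual sketch only; \p plus and \p d_out are placeholders):
+ * \code
+ * // pairs[]: (row, partial_value) items held by one thread; d_out: per-row output
+ * for (int i = 1; i < ITEMS_PER_THREAD; ++i)
+ * {
+ *     if (pairs[i].key != pairs[i - 1].key)
+ *         atomicAdd(d_out + pairs[i - 1].key, pairs[i - 1].value);   // row ends: flush
+ *     else
+ *         pairs[i].value = plus(pairs[i - 1].value, pairs[i].value); // same row: fold
+ * }
+ * \endcode
+ * Otherwise the agent falls back to an exclusive prefix scan over (row, value)
+ * pairs with a reduce-by-segment operator, scattering each segment's aggregate once.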
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSegmentFixup + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSegmentFixupPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT, ///< Signed integer type for global offsets + typename SemiringT> ///< Semiring type +struct AgentSegmentFixup +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of key-value input iterator + typedef typename std::iterator_traits::value_type KeyValuePairT; + + // Value type + typedef typename KeyValuePairT::Value ValueT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not do fixup using RLE + global atomics + // double atomics starting with 6.0 + USE_ATOMIC_FIXUP = (((CUB_PTX_ARCH >= 350) && + (Equals::VALUE || + Equals::VALUE || + Equals::VALUE || + Equals::VALUE)) + || + ((CUB_PTX_ARCH >= 600) && + (Equals::VALUE))) + && SemiringT::HAS_PLUS_ATOMICS, // don't use atomics for semirings like maxmin + + // Whether or not the scan operation has a 
zero-valued identity value (true if we're performing addition on a primitive type) + // not used. + //HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + PairsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedPairsInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for pairs + typedef BlockLoad< + KeyValuePairT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSegmentFixupPolicyT::LOAD_ALGORITHM> + BlockLoadPairs; + + // Parameterized BlockScan type + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSegmentFixupPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + KeyValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for loading keys + typename BlockLoadPairs::TempStorage load_pairs; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedPairsInputIteratorT d_pairs_in; ///< Input keys + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values + InequalityWrapper inequality_op; ///< KeyT inequality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSegmentFixup( + TempStorage& temp_storage, ///< Reference to temp_storage + PairsInputIteratorT d_pairs_in, ///< Input keys + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_pairs_in(d_pairs_in), + d_aggregates_out(d_aggregates_out), + d_fixup_in(d_aggregates_out), + inequality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process input tile. 
Specialized for atomic-fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT max_item, ///< maximum item key, to prevent OOB writes + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + // RLE + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; + if (pairs[ITEM].key != pairs[ITEM - 1].key && pairs[ITEM - 1].key < max_item) + atomicAdd(d_scatter, pairs[ITEM - 1].value); + else + pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); + } + + // Flush last item if valid + ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; + if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) + atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); + } + + + /** + * Process input tile. Specialized for reduce-by-key fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT max_item, ///< maximum item key, to prevent OOB writes + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + CTA_SYNC(); + + KeyValuePairT tile_aggregate; + if (tile_idx == 0) + { + // Exclusive scan of values and segment_flags + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); + + // Update tile status if this is not the last tile + if (threadIdx.x == 0) + { + // Set first segment id to not trigger a flush (invalid from exclusive scan) + scatter_pairs[0].key = pairs[0].key; + + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, tile_aggregate); + + } + } + else + { + // Exclusive scan of values and segment_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); + tile_aggregate = prefix_op.GetBlockAggregate(); + } + + // Scatter updated values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scatter_pairs[ITEM].key != pairs[ITEM].key && scatter_pairs[ITEM].key < max_item) + { + // Update the value at the key location + ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; + value = reduction_op(value, scatter_pairs[ITEM].value); + + d_aggregates_out[scatter_pairs[ITEM].key] = value; + } + } + + // Finalize the last item + if (IS_LAST_TILE) + { + // 
Last thread will output final count and last item, if necessary + if (threadIdx.x == BLOCK_THREADS - 1) + { + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + // Update the value at the key location + OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; + d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); + } + } + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT max_item, + int num_items, ///< Total number of input items + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not the last tile (full) + ConsumeTile(max_item, num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + ConsumeTile(max_item, num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_select_if.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_select_if.cuh new file mode 100644 index 00000000000..f365481915b --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_select_if.cuh @@ -0,0 +1,703 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
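+
+// A minimal host-side sketch of the device-wide selection that the AgentSelectIf
+// defined in this header backs, via the standard cub::DeviceSelect entry point
+// (names outside this file are assumed from stock CUB; select_op is any functor
+// returning true for items to keep):
+//
+//   void   *d_temp_storage     = NULL;
+//   size_t  temp_storage_bytes = 0;
+//   // First call sizes temporary storage; second call performs the selection
+//   cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out,
+//                         d_num_selected_out, num_items, select_op);
+//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
+//   cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out,
+//                         d_num_selected_out, num_items, select_op);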
+ * + ******************************************************************************/ + +/** + * \file + * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSelectIf + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSelectIfPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for selection items + typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items + typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct AgentSelectIf +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename 
std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + SELECT_METHOD = (!Equals::VALUE) ? + USE_SELECT_OP : + (!Equals::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for items + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + FlagsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFlagsInputIteratorT; + + // Parameterized BlockLoad type for input data + typedef BlockLoad< + OutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + FlagT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockDiscontinuity type for items + typedef BlockDiscontinuity< + OutputT, + BLOCK_THREADS> + BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetT, + BLOCK_THREADS, + AgentSelectIfPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetT, + cub::Sum, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Item exchange type + typedef OutputT ItemExchangeT[TILE_ITEMS]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading items + typename BlockLoadT::TempStorage load_items; + + // Smem needed for loading values + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for compacting items (allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input items + SelectedOutputIteratorT 
d_selected_out; ///< Unique output items + WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) + InequalityWrapper inequality_op; ///< T inequality operator + SelectOpT select_op; ///< Selection operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSelectIf( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< Output data + SelectOpT select_op, ///< Selection operator + EqualityOpT equality_op, ///< Equality operator + OffsetT num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags_in(d_flags_in), + d_selected_out(d_selected_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Initialize selections (specialized for selection operator) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT /*tile_offset*/, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Out-of-bounds items are selection_flags + selection_flags[ITEM] = 1; + + if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) + selection_flags[ITEM] = select_op(items[ITEM]); + } + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&/*items*/)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + CTA_SYNC(); + + FlagT flags[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Out-of-bounds items are selection_flags + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); + } + else + { + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); + } + + // Convert flag type to selection_flags type + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selection_flags[ITEM] = flags[ITEM]; + } + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + if (IS_FIRST_TILE) + { + CTA_SYNC(); + + // Set head selection_flags. 
First tile sets the first flag for the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); + } + else + { + OutputT tile_predecessor; + if (threadIdx.x == 0) + tile_predecessor = d_in[tile_offset - 1]; + + CTA_SYNC(); + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); + } + + // Set selection flags for out-of-bounds items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + } + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + */ + template + __device__ __forceinline__ void ScatterDirect( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + OffsetT num_selections) + { + // Scatter flagged items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selection_flags[ITEM]) + { + if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) + { + d_selected_out[selection_indices[ITEM]] = items[ITEM]; + } + } + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + // Compact and scatter items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; + if (selection_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) + { + d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + int tile_num_rejections = num_tile_items - num_tile_selections; + + // Scatter items to shared memory 
(rejections first) + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; + int local_rejection_idx = item_idx - local_selection_idx; + int local_scatter_offset = (selection_flags[ITEM]) ? + tile_num_rejections + local_selection_idx : + local_rejection_idx; + + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + + CTA_SYNC(); + + // Gather items from shared memory and scatter to global + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; + OffsetT scatter_offset = (item_idx < tile_num_rejections) ? + num_items - num_rejected_prefix - rejection_idx - 1 : + num_selections_prefix + selection_idx; + + OutputT item = temp_storage.raw_exchange.Alias()[item_idx]; + + if (!IS_LAST_TILE || (item_idx < num_tile_items)) + { + d_selected_out[scatter_offset] = item; + } + } + } + + + /** + * Scatter flagged items + */ + template + __device__ __forceinline__ void Scatter( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + OffsetT num_selections) ///< Total number of selections including this tile + { + // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one + if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) + { + ScatterTwoPhase( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + Int2Type()); + } + else + { + ScatterDirect( + items, + selection_flags, + selection_indices, + num_selections); + } + } + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process first tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeFirstTile( + int num_tile_items, ///< Number of input items comprising this tile + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of selection_flags + OffsetT num_tile_selections; + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); + + if (threadIdx.x == 0) + { + // Update tile status if this is not the last tile + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, num_tile_selections); + } + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + num_tile_selections -= (TILE_ITEMS - num_tile_items); + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + 0, + 0, + num_tile_selections); + + return num_tile_selections; + } + + + /** + * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeSubsequentTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of values and selection_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx); + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); + + OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); + OffsetT num_selections = prefix_op.GetInclusivePrefix(); + OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); + OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + { + int num_discount = TILE_ITEMS - num_tile_items; + num_selections -= num_discount; + num_tile_selections -= num_discount; + } + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + num_selections); + + return num_selections; + } + + + /** + * Process a tile of input + */ + template + __device__ __forceinline__ OffsetT ConsumeTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) 
///< Global tile state descriptor + { + OffsetT num_selections; + if (tile_idx == 0) + { + num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); + } + else + { + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); + } + + return num_selections; + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selection_flags + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state, ///< Global tile state descriptor + NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); + } + else + { + // The last tile (possibly partially-full) + OffsetT num_remaining = num_items - tile_offset; + OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + + if (threadIdx.x == 0) + { + // Output the total number of items selection_flags + *d_num_selected_out = num_selections; + } + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/agent_spmv_orig.cuh b/cpp/nvgraph/external/cub_semiring/agent/agent_spmv_orig.cuh new file mode 100644 index 00000000000..65e0d2bd2d7 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/agent_spmv_orig.cuh @@ -0,0 +1,692 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
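+
+// A minimal host-side sketch of the device-wide SpMV that the AgentSpmv defined
+// in this header backs, via the standard cub::DeviceSpmv entry point (names
+// outside this file are assumed from stock CUB; the semiring build threads an
+// additional semiring/alpha/beta parameterization through the same pattern):
+//
+//   void   *d_temp_storage     = NULL;
+//   size_t  temp_storage_bytes = 0;
+//   // First call sizes temporary storage; second call computes y = A * x
+//   cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+//                          d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+//                          num_rows, num_cols, num_nonzeros);
+//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
+//   cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+//                          d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+//                          num_rows, num_cols, num_nonzeros);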
+ * + ******************************************************************************/ + +/** + * \file + * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_reduce.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../thread/thread_search.cuh" +#include "../thread/thread_operators.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/counting_input_iterator.cuh" +#include "../iterator/tex_ref_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSpmv + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search + CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets + CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices + CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values + CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values + bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSpmvPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) + }; + + static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices + static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values + static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +struct SpmvParams +{ + const ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
+ const OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + const OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + const ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows; ///< Number of rows of matrix A. + int num_cols; ///< Number of columns of matrix A. + int num_nonzeros; ///< Number of nonzero elements of matrix A. + ValueT alpha; ///< Alpha multiplicand + ValueT beta; ///< Beta addend-multiplicand + + TexRefInputIterator t_vector_x; +}; + + +/** + * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename SemiringT, ///< Semiring type + bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 + bool HAS_BETA, ///< Whether the input parameter \p beta is 0 + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentSpmv +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + /// 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + /// Input iterator wrapper types (for applying cache modifiers) + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, + OffsetT, + OffsetT> + ColumnIndicesIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + ValueIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // BlockReduce specialization + typedef BlockReduce< + ValueT, + BLOCK_THREADS, + BLOCK_REDUCE_WARP_REDUCTIONS> + BlockReduceT; + + // BlockScan specialization + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // BlockScan specialization + typedef BlockScan< + ValueT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockPrefixSumT; + + // BlockExchange specialization + typedef BlockExchange< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + /// Merge item type (either a non-zero value or a row-end offset) + union MergeItem + { + // Value type to pair with index type OffsetT (NullType if loading values directly during 
merge) + typedef typename If::Type MergeValueT; + + OffsetT row_end_offset; + MergeValueT nonzero; + }; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CoordinateT tile_coords[2]; + + union Aliasable + { + // Smem needed for tile of merge items + MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; + + // Smem needed for block exchange + typename BlockExchangeT::TempStorage exchange; + + // Smem needed for block-wide reduction + typename BlockReduceT::TempStorage reduce; + + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for tile prefix sum + typename BlockPrefixSumT::TempStorage prefix_sum; + + } aliasable; + }; + + /// Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + + _TempStorage& temp_storage; /// Reference to temp_storage + + SpmvParams& spmv_params; + + ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentSpmv( + TempStorage& temp_storage, ///< Reference to temp_storage + SpmvParams& spmv_params) ///< SpMV input parameter bundle + : + temp_storage(temp_storage.Alias()), + spmv_params(spmv_params), + wd_values(spmv_params.d_values), + wd_row_end_offsets(spmv_params.d_row_end_offsets), + wd_column_indices(spmv_params.d_column_indices), + wd_vector_x(spmv_params.d_vector_x), + wd_vector_y(spmv_params.d_vector_y) + {} + + + + + /** + * Consume a merge tile, specialized for direct-load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + + // Gather the row end-offsets for the merge tile into shared memory + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // 
Diagonal + s_tile_row_end_offsets, // List A + tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + + ValueT running_total = SemiringT::plus_ident(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); + OffsetT column_idx = wd_column_indices[nonzero_idx]; + ValueT value = wd_values[nonzero_idx]; + +// #if (CUB_PTX_ARCH >= 350) +// ValueT vector_value = wd_vector_x[column_idx]; +// #else +// ValueT vector_value = spmv_params.t_vector_x[column_idx]; +// #endif + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = SemiringT::times(value, vector_value); + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + running_total = SemiringT::plus(nonzero, running_total); + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = tile_num_rows; + ++thread_current_coord.y; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = thread_current_coord.x; + running_total = SemiringT::plus_ident(); + ++thread_current_coord.x; + } + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (tile_num_rows > 0) + { + if (threadIdx.x == 0) + scan_item.key = -1; + + // Direct scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scan_segment[ITEM].key < tile_num_rows) + { + if (scan_item.key == scan_segment[ITEM].key) + scan_segment[ITEM].value = SemiringT::plus(scan_item.value, scan_segment[ITEM].value); + + if (HAS_ALPHA) + { + scan_segment[ITEM].value = SemiringT::times(scan_segment[ITEM].value, spmv_params.alpha); + } + + if (HAS_BETA) + { + // Update the output vector element + ValueT addend = SemiringT::times(spmv_params.beta, wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]); + scan_segment[ITEM].value = SemiringT::plus(addend, scan_segment[ITEM].value); + } + + // Set the output vector element + spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; + } + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + + /** + * Consume a merge tile, specialized for indirect load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + +#if (CUB_PTX_ARCH >= 520) + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + 
ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + + ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; + ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; + ValueT* s = s_tile_nonzeros + nonzero_idx; + + if (nonzero_idx < tile_num_nonzeros) + { + + OffsetT column_idx = *ci; + ValueT value = *a; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; + vector_value = wd_vector_x[column_idx]; + + ValueT nonzero = SemiringT::times(value, vector_value); + + *s = nonzero; + } + } + + +#else + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + if (tile_num_nonzeros > 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); + + OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; + ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; + +// #if (CUB_PTX_ARCH >= 350) +// ValueT vector_value = wd_vector_x[column_idx]; +// #else +// ValueT vector_value = spmv_params.t_vector_x[column_idx]; +// #endif + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = SemiringT::times(value, vector_value); + + s_tile_nonzeros[nonzero_idx] = nonzero; + } + } + +#endif + + // Gather the row end-offsets for the merge tile into shared memory + #pragma unroll 1 + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + ValueT running_total = SemiringT::plus_ident(); + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + scan_segment[ITEM].value = nonzero; + running_total = SemiringT::plus(nonzero, running_total); + ++thread_current_coord.y; + nonzero = s_tile_nonzeros[thread_current_coord.y]; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = SemiringT::plus_ident(); + running_total = SemiringT::plus_ident(); + ++thread_current_coord.x; + row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + } + + scan_segment[ITEM].key = thread_current_coord.x; + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT 
scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (threadIdx.x == 0) + { + scan_item.key = thread_start_coord.x; + scan_item.value = SemiringT::plus_ident(); + } + + if (tile_num_rows > 0) + { + + CTA_SYNC(); + + // Scan downsweep and scatter + ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; + + if (scan_item.key != scan_segment[0].key) + { + s_partials[scan_item.key] = scan_item.value; + } + else + { + scan_segment[0].value = SemiringT::plus(scan_item.value, scan_segment[0].value); + } + + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key) + { + s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; + } + else + { + scan_segment[ITEM].value = SemiringT::plus(scan_segment[ITEM].value, scan_segment[ITEM - 1].value); + } + } + + CTA_SYNC(); + + #pragma unroll 1 + for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) + { + if (HAS_ALPHA) + { + s_partials[item] = SemiringT::times(s_partials[item], spmv_params.alpha); + } + + if (HAS_BETA) + { + // Update the output vector element + ValueT addend = SemiringT::times(spmv_params.beta, spmv_params.d_vector_y[tile_start_coord.x + item]); + s_partials[item] = SemiringT::plus(addend, s_partials[item]); + } + spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + /** + * Consume input tile + */ + __device__ __forceinline__ void ConsumeTile( + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_merge_tiles) ///< [in] Number of merge tiles + { + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + + if (tile_idx >= num_merge_tiles) + return; + + // Read our starting coordinates + if (threadIdx.x < 2) + { + if (d_tile_coordinates == NULL) + { + // Search our starting coordinates + OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; + CoordinateT tile_coord; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coord); + + temp_storage.tile_coords[threadIdx.x] = tile_coord; + } + else + { + temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; + } + } + + CTA_SYNC(); + + CoordinateT tile_start_coord = temp_storage.tile_coords[0]; + CoordinateT tile_end_coord = temp_storage.tile_coords[1]; + + // Consume multi-segment tile + KeyValuePairT tile_carry = ConsumeTile( + tile_idx, + tile_start_coord, + tile_end_coord, + Int2Type()); + + // Output the tile's carry-out + if (threadIdx.x == 0) + { + if (HAS_ALPHA) + tile_carry.value = SemiringT::times(spmv_params.alpha, tile_carry.value); + + tile_carry.key += tile_start_coord.x; + d_tile_carry_pairs[tile_idx] = tile_carry; + } + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/agent/single_pass_scan_operators.cuh b/cpp/nvgraph/external/cub_semiring/agent/single_pass_scan_operators.cuh new file mode 
100644 index 00000000000..8106e42e108 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/agent/single_pass_scan_operators.cuh @@ -0,0 +1,815 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a + * region independent of other thread blocks + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. + */ +template < + typename T, ///< BlockScan value type + typename ScanOpT> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOpT op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOpT op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. 
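+     *
+     * Illustrative use (BlockScanT, temp_storage, items, and num_tiles are assumed to be
+     * defined by the caller): carry a running prefix across consecutive tiles processed
+     * by the same thread block, e.g. with cub::Sum as the wrapped operator:
+     *
+     *     BlockScanRunningPrefixOp<int, cub::Sum> prefix_op(0, cub::Sum());
+     *     for (int tile = 0; tile < num_tiles; ++tile)
+     *     {
+     *         BlockScanT(temp_storage).ExclusiveSum(items, items, prefix_op);
+     *         __syncthreads();   // temp_storage is reused across tiles
+     *     }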
+ */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Generic tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID = 99, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. + */ +template +struct ScanTileState +{ + // Status word type + typedef typename If<(sizeof(T) == 8), + long long, + typename If<(sizeof(T) == 4), + int, + typename If<(sizeof(T) == 2), + short, + char>::Type>::Type>::Type StatusWord; + + + // Unit word type + typedef typename If<(sizeof(T) == 8), + longlong2, + typename If<(sizeof(T) == 4), + int2, + typename If<(sizeof(T) == 2), + int, + uchar2>::Type>::Type>::Type TxnWord; + + + // Device word type + struct TileDescriptor + { + StatusWord status; + T value; + }; + + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + + // Device storage + TxnWord *d_tile_descriptors; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template +struct ScanTileState +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3] = {NULL, NULL, NULL}; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast(allocations[0]); + d_tile_partial = reinterpret_cast(allocations[1]); + d_tile_inclusive = reinterpret_cast(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + // Update tile inclusive value + ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + // Update tile partial value + ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + do { + status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); + + __threadfence(); // prevent hoisting loads from loop or loads below above this one + + } while (status == SCAN_TILE_INVALID); + + if (status == 
StatusWord(SCAN_TILE_PARTIAL)) + value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); + else + value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); + } +}; + + +/****************************************************************************** + * ReduceByKey tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Tile status interface for reduction by key. + * + */ +template < + typename ValueT, + typename KeyT, + bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState : + ScanTileState > +{ + typedef ScanTileState > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. + */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState +{ + typedef KeyValuePairKeyValuePairT; + + // Constants + enum + { + PAIR_SIZE = sizeof(ValueT) + sizeof(KeyT), + TXN_WORD_SIZE = 1 << Log2::VALUE, + STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, + + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Status word type + typedef typename If<(STATUS_WORD_SIZE == 8), + long long, + typename If<(STATUS_WORD_SIZE == 4), + int, + typename If<(STATUS_WORD_SIZE == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Status word type + typedef typename If<(TXN_WORD_SIZE == 16), + longlong2, + typename If<(TXN_WORD_SIZE == 8), + long long, + int>::Type>::Type TxnWord; + + // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) + struct TileDescriptorBigStatus + { + KeyT key; + ValueT value; + StatusWord status; + }; + + // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) + struct TileDescriptorLittleStatus + { + ValueT value; + StatusWord status; + KeyT key; + }; + + // Device word type + typedef typename If< + (sizeof(ValueT) == sizeof(KeyT)), + TileDescriptorBigStatus, + TileDescriptorLittleStatus>::Type + TileDescriptor; + + + // Device storage + TxnWord *d_tile_descriptors; + + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive.value; + tile_descriptor.key = tile_inclusive.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial.value; + tile_descriptor.key = tile_partial.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + KeyValuePairT &value) + { +// TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// TileDescriptor tile_descriptor = reinterpret_cast(alias); +// +// while (tile_descriptor.status == SCAN_TILE_INVALID) +// { +// __threadfence_block(); // prevent hoisting loads from loop +// +// alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// tile_descriptor = reinterpret_cast(alias); +// } +// +// status = tile_descriptor.status; +// value.value = tile_descriptor.value; +// value.key = tile_descriptor.key; + + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value.value = tile_descriptor.value; + value.key = tile_descriptor.key; + } + +}; + + +/****************************************************************************** + * Prefix call-back operator for coupling local block scan within a + * block-cooperative scan + 
******************************************************************************/ + +/** + * Stateful block-scan prefix functor. Provides the the running prefix for + * the current tile by using the call-back warp to wait on on + * aggregates/prefixes from predecessor tiles to become available. + */ +template < + typename T, + typename ScanOpT, + typename ScanTileStateT, + int PTX_ARCH = CUB_PTX_ARCH> +struct TilePrefixCallbackOp +{ + // Parameterized warp reduce + typedef WarpReduce WarpReduceT; + + // Temporary storage type + struct _TempStorage + { + typename WarpReduceT::TempStorage warp_reduce; + T exclusive_prefix; + T inclusive_prefix; + T block_aggregate; + }; + + // Alias wrapper allowing temporary storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + // Type of status word + typedef typename ScanTileStateT::StatusWord StatusWord; + + // Fields + _TempStorage& temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT& tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile + + // Constructor + __device__ __forceinline__ + TilePrefixCallbackOp( + ScanTileStateT &tile_status, + TempStorage &temp_storage, + ScanOpT scan_op, + int tile_idx) + : + temp_storage(temp_storage.Alias()), + tile_status(tile_status), + scan_op(scan_op), + tile_idx(tile_idx) {} + + + // Block until all predecessors within the warp-wide window have non-invalid status + __device__ __forceinline__ + void ProcessWindow( + int predecessor_idx, ///< Preceding tile index to inspect + StatusWord &predecessor_status, ///< [out] Preceding tile status + T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles + { + T value; + tile_status.WaitForValid(predecessor_idx, predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window. + // Use the swizzled scan operator because we are now scanning *down* towards thread0. 
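+        // (Decoupled look-back, invoked after this tile's PARTIAL aggregate has been
+        // published: a predecessor whose status is SCAN_TILE_INCLUSIVE terminates the
+        // segment, so the tail-segmented reduction accumulates predecessor aggregates
+        // only back to, and including, the nearest tile whose inclusive prefix is
+        // already known.)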
+ + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + temp_storage.block_aggregate = block_aggregate; + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + + temp_storage.exclusive_prefix = exclusive_prefix; + temp_storage.inclusive_prefix = inclusive_prefix; + } + + // Return exclusive_prefix + return exclusive_prefix; + } + + // Get the exclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetExclusivePrefix() + { + return temp_storage.exclusive_prefix; + } + + // Get the inclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetInclusivePrefix() + { + return temp_storage.inclusive_prefix; + } + + // Get the block aggregate stored in temporary storage + __device__ __forceinline__ + T GetBlockAggregate() + { + return temp_storage.block_aggregate; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_adjacent_difference.cuh b/cpp/nvgraph/external/cub_semiring/block/block_adjacent_difference.cuh new file mode 100644 index 00000000000..1125fe59cea --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_adjacent_difference.cuh @@ -0,0 +1,596 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockAdjacentDifference +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(b, a, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(b, a); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * 
ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockAdjacentDifference() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
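+     *
+     * Illustrative pattern (a sketch; block size and item type are the caller's choice):
+     *
+     *     __shared__ typename BlockAdjacentDifference<int, 128>::TempStorage temp_storage;
+     *     BlockAdjacentDifference<int, 128> adj_diff(temp_storage);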
+ */ + __device__ __forceinline__ BlockAdjacentDifference( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
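+    // Illustrative usage of this overload (a sketch; BlockAdjacentDifferenceT, temp_storage,
+    // thread_data, and tile_predecessor are assumed to be defined by the caller): flag the
+    // first item of each run of equal values, seeding thread0 with the last item of the
+    // preceding tile:
+    //
+    //     int head_flags[ITEMS_PER_THREAD];
+    //     BlockAdjacentDifferenceT(temp_storage).FlagHeads(
+    //         head_flags, thread_data, cub::Inequality(), tile_predecessor);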
+ { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/block/block_discontinuity.cuh b/cpp/nvgraph/external/cub_semiring/block/block_discontinuity.cuh new file mode 100644 index 00000000000..428882f70ab --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_discontinuity.cuh @@ -0,0 +1,1148 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be flagged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items + * that differ from their predecessors (or successors). For example, head flags are convenient + * for demarcating disjoint data segments as part of a segmented scan or reduction. + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockDiscontinuity} + * \par + * The code snippet below illustrates the head flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \par Performance Considerations + * - Incurs zero bank conflicts for most types + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockDiscontinuity +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(a, b); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + 
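+    /*
+     * Illustrative sketch, not part of the upstream CUB sources: the
+     * Iterate<ITERATION, MAX_ITERATIONS> template above is a compile-time
+     * unrolling device -- each call flags one item and then recurses on
+     * ITERATION + 1 until the termination specialization below is selected.
+     * Assuming ITEMS_PER_THREAD == 4, the call Iterate<1, 4>::FlagHeads(...)
+     * issued by the public FlagHeads() methods effectively expands to:
+     *
+     *   preds[1] = input[0];  flags[1] = flag_op(preds[1], input[1]);  // idx = tid*4 + 1
+     *   preds[2] = input[1];  flags[2] = flag_op(preds[2], input[2]);  // idx = tid*4 + 2
+     *   preds[3] = input[2];  flags[3] = flag_op(preds[3], input[3]);  // idx = tid*4 + 3
+     *
+     * and Iterate<0, 3>::FlagTails(...) expands to:
+     *
+     *   flags[0] = flag_op(input[0], input[1]);  // idx = tid*4 + 1
+     *   flags[1] = flag_op(input[1], input[2]);  // idx = tid*4 + 2
+     *   flags[2] = flag_op(input[2], input[3]);  // idx = tid*4 + 3
+     *
+     * The idx argument is forwarded only when the FlagOp functor accepts a
+     * third index parameter; the ApplyOp specializations above make that
+     * selection at compile time.
+     */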
/// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads( + * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... 
}, + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be + * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + //@} end member group + /******************************************************************//** + * \name Tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. + * The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * tail_flags, thread_data, cub::Inequality(), tile_successor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + //@} end member group + /******************************************************************//** + * \name Head & tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_predecessor_item. 
+ * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
+ tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
+ tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/block/block_exchange.cuh b/cpp/nvgraph/external/cub_semiring/block/block_exchange.cuh new file mode 100644 index 00000000000..c0e32fda555 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_exchange.cuh @@ -0,0 +1,1248 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. 
+ * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - It is commonplace for blocks of threads to rearrange data items between + * threads. For example, the device-accessible memory subsystem prefers access patterns + * where data items are "striped" across threads (where consecutive threads access consecutive items), + * yet most block-wide operations prefer a "blocked" partitioning of items across threads + * (where consecutive items belong to a single thread). + * - BlockExchange supports the following types of data exchanges: + * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements + * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) + * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockExchange} + * \par + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of data striped across threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * \par Performance Considerations + * - Proper device-specific padding ensures zero bank conflicts for most types. 
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + bool WARP_TIME_SLICING = false, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, + + TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, + TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct __align__(16) _TempStorage + { + InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{BlockExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + unsigned int lane_id; + unsigned int warp_id; + unsigned int warp_offset; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. 
Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + if (warp_id == 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + #pragma unroll + for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + // No timeslicing + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
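All of the transpose variants in this class convert between the same three register arrangements; only the index formula changes. A short host-side enumeration (hypothetical 64-thread block, 2 items per thread, 32-thread warps) spells the formulas out.

```cuda
// Which tile element does (thread t, register slot i) hold under each
// arrangement? Sizes are illustrative.
#include <cstdio>

int main()
{
    const int BLOCK_THREADS    = 64;
    const int ITEMS_PER_THREAD = 2;
    const int WARP_THREADS     = 32;

    for (int t = 0; t < BLOCK_THREADS; t += 21)   // sample a few threads
    {
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            int blocked      = t * ITEMS_PER_THREAD + i;
            int striped      = i * BLOCK_THREADS + t;
            int warp_id      = t / WARP_THREADS;
            int lane_id      = t % WARP_THREADS;
            int warp_striped = warp_id * WARP_THREADS * ITEMS_PER_THREAD
                             + i * WARP_THREADS + lane_id;
            printf("thread %2d slot %d: blocked=%3d striped=%3d warp-striped=%3d\n",
                   t, i, blocked, striped, warp_striped);
        }
    }
    return 0;
}
```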
+ Int2Type /*time_slicing*/) + { + // Warp time-slicing + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Write a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + CTA_SYNC(); + + const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. 
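The scatter paths above spell the padded offset as `SHR_ADD(offset, LOG_SMEM_BANKS, offset)` where the transpose paths wrote `offset += offset >> LOG_SMEM_BANKS`; assuming `SHR_ADD(x, shift, addend)` evaluates `(x >> shift) + addend` as in the accompanying util_ptx.cuh, the two spellings are the same computation. A trivial host-side check, with the bank count chosen for illustration:

```cuda
// SHR_ADD(offset, LOG_SMEM_BANKS, offset) and
// "offset += offset >> LOG_SMEM_BANKS" produce identical padded offsets.
#include <cassert>

// Host stand-in for the SHR_ADD intrinsic: (x >> shift) + addend
static unsigned int shr_add(unsigned int x, unsigned int shift, unsigned int addend)
{
    return (x >> shift) + addend;
}

int main()
{
    const unsigned int LOG_SMEM_BANKS = 5;   // 32 banks, illustrative
    for (unsigned int offset = 0; offset < 4096; ++offset)
        assert(shr_add(offset, LOG_SMEM_BANKS, offset) ==
               offset + (offset >> LOG_SMEM_BANKS));
    return 0;
}
```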
+ */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockExchange() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + lane_id(LaneId()), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + //@} end member group + /******************************************************************//** + * \name Structured exchanges + *********************************************************************/ + //@{ + + /** + * \brief Transposes data items from striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a striped arrangement across block threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(input_items, output_items, Int2Type()); + } + + + /** + * \brief Transposes data items from blocked arrangement to striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
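The StripedToBlocked snippet above lost its include line and template arguments in this import (as did the other doxygen snippets in these headers). A filled-in version, assuming the conventional CUB umbrella header (this vendored copy would use the corresponding cub_semiring path instead), would read roughly as follows.

```cuda
// The StripedToBlocked snippet above with its template arguments restored.
#include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>; path assumed

__global__ void ExampleKernel(int *d_data)
{
    // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockExchange<int, 128, 4> BlockExchange;

    // Allocate shared memory for BlockExchange
    __shared__ typename BlockExchange::TempStorage temp_storage;

    // Load a tile of ordered data into a striped arrangement across block threads
    int thread_data[4];
    cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);

    // Collectively exchange data into a blocked arrangement across threads
    BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
}
```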
+ * + * // Collectively exchange data into a striped arrangement across threads + * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); + * + * // Store data striped across block threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in + * preparation for storing to device-accessible memory. + * + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a warp-striped arrangement across warp threads + * int thread_data[4]; + * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of warp-striped input \p thread_data across the block of threads is + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * after loading from device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a warp-striped arrangement across threads + * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); + * + * // Store data striped across warp threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * in preparation for storing to device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(input_items, output_items, Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Scatter exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Exchanges data items annotated by rank into blocked arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. 
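The rank-based scatters take an arbitrary block-wide permutation, so they are not limited to the fixed blocked/striped transposes. A small hypothetical example that reverses a 512-item tile and leaves it striped for the store; the include path and the StoreDirectStriped helper are assumed from the companion headers, and the kernel name is invented for illustration.

```cuda
// Hypothetical use of the rank-based ScatterToStriped above: reverse a tile
// and leave it striped across the block, ready for a coalesced store.
#include <cub/cub.cuh>   // include path assumed

constexpr int BLOCK_THREADS    = 128;
constexpr int ITEMS_PER_THREAD = 4;
constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

__global__ void ReverseTileKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchange;
    __shared__ typename BlockExchange::TempStorage temp_storage;

    int items[ITEMS_PER_THREAD];
    int ranks[ITEMS_PER_THREAD];

    // Blocked load: thread t owns tile elements [t*4, t*4 + 4)
    cub::LoadDirectBlocked(threadIdx.x, d_in, items);

    // Rank each item at its mirrored tile position
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int blocked = threadIdx.x * ITEMS_PER_THREAD + i;
        ranks[i] = TILE_ITEMS - 1 - blocked;
    }

    // Scatter by rank, producing a striped arrangement in registers
    BlockExchange(temp_storage).ScatterToStriped(items, items, ranks);

    // Striped store back to global memory
    cub::StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out, items);
}
```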
+ OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (ranks[ITEM] >= 0) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + * \tparam ValidFlag [inferred] FlagT type denoting which items are valid + */ + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (is_valid[ITEM]) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + //@} end member group + + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + __device__ __forceinline__ void StripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(items, items); + } + + __device__ __forceinline__ void WarpStripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToWarpStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(items, items); + } + + template + __device__ __forceinline__ void ScatterToBlocked( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
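ScatterToStripedGuarded and ScatterToStripedFlagged exist mainly to support select/compaction-style kernels, where ranks come from a block-wide prefix sum over per-item validity flags. The sketch below shows that pattern under stated assumptions: the selection predicate, sizes, and kernel name are hypothetical, and cub::BlockScan is assumed to be available from the same vendored copy; this is the usual usage shape, not code taken from this file.

```cuda
// Sketch: feed ScatterToStripedFlagged with ranks from an exclusive prefix
// sum over validity flags (keep even values only). Illustrative sizes.
#include <cub/cub.cuh>   // include path assumed

constexpr int BLOCK_THREADS    = 128;
constexpr int ITEMS_PER_THREAD = 4;

__global__ void CompactEvensKernel(const int *d_in, int *d_out, int *d_num_selected)
{
    typedef cub::BlockScan<int, BLOCK_THREADS>                       BlockScan;
    typedef cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchange;

    __shared__ union {
        typename BlockScan::TempStorage     scan;
        typename BlockExchange::TempStorage exchange;
    } temp_storage;

    int items[ITEMS_PER_THREAD];
    int flags[ITEMS_PER_THREAD];
    int ranks[ITEMS_PER_THREAD];

    cub::LoadDirectBlocked(threadIdx.x, d_in, items);

    // Flag the items we want to keep
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        flags[i] = (items[i] % 2 == 0) ? 1 : 0;

    // Exclusive prefix sum of flags gives each kept item its output rank
    int num_selected;
    BlockScan(temp_storage.scan).ExclusiveSum(flags, ranks, num_selected);
    __syncthreads();   // temp_storage is reused by the exchange below

    // Scatter only the flagged items into a striped arrangement
    BlockExchange(temp_storage.exchange).ScatterToStripedFlagged(items, items, ranks, flags);

    // Striped store of the first num_selected compacted positions
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int idx = i * BLOCK_THREADS + threadIdx.x;
        if (idx < num_selected)
            d_out[idx] = items[i];
    }

    if (threadIdx.x == 0)
        *d_num_selected = num_selected;
}
```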
+ OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStriped( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStripedGuarded(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + ScatterToStriped(items, items, ranks, is_valid); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +template < + typename T, + int ITEMS_PER_THREAD, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + // Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct _TempStorage + { + T buff[WARP_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{WarpExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + int lane_id; + +public: + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpExchange( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. 
+ * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + temp_storage.buff[ranks[ITEM]] = items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage.buff[item_offset]; + } + } + +}; + + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_histogram.cuh b/cpp/nvgraph/external/cub_semiring/block/block_histogram.cuh new file mode 100644 index 00000000000..5d393c2353f --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_histogram.cuh @@ -0,0 +1,415 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_histogram_sort.cuh" +#include "specializations/block_histogram_atomic.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + */ +enum BlockHistogramAlgorithm +{ + + /** + * \par Overview + * Sorting followed by differentiation. Execution is comprised of two phases: + * -# Sort the data using efficient radix sort + * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + BLOCK_HISTO_SORT, + + + /** + * \par Overview + * Use atomic addition to update byte counts directly + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + BLOCK_HISTO_ATOMIC, +}; + + + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + +/** + * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) + * \ingroup BlockModule + * + * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam BINS The number bins within the histogram + * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * - BlockHistogram can be optionally specialized to use different algorithms: + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockHistogram} + * \par + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
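BLOCK_HISTO_ATOMIC is conceptually just atomicAdd into shared bin counters. The standalone sketch below shows that shape (illustrative 128x4 configuration with 256 bins); it is not the library's internal specialization, only the idea behind it.

```cuda
// What the BLOCK_HISTO_ATOMIC variant boils down to: each thread atomically
// bumps a shared bin counter per sample. Illustrative sketch only.
constexpr int BLOCK_THREADS    = 128;
constexpr int ITEMS_PER_THREAD = 4;
constexpr int BINS             = 256;

__global__ void AtomicHistogramSketch(const unsigned char *d_samples,
                                      unsigned int *d_histogram)
{
    __shared__ unsigned int smem_histogram[BINS];

    // Zero the bins cooperatively (BINS is a multiple of BLOCK_THREADS here)
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        smem_histogram[bin] = 0;
    __syncthreads();

    // Composite this thread's samples
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        unsigned char sample = d_samples[threadIdx.x * ITEMS_PER_THREAD + i];
        atomicAdd(&smem_histogram[sample], 1u);
    }
    __syncthreads();

    // Publish the block-wide result
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        d_histogram[bin] = smem_histogram[bin];
}
```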
+ * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char data[4]; + * ... + * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(data, smem_histogram); + * + * \endcode + * + * \par Performance and Usage Considerations + * - The histogram output can be constructed in shared or device-accessible memory + * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + int BINS, + BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockHistogram +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used + * regardless. + */ + static const BlockHistogramAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? + BLOCK_HISTO_SORT : + ALGORITHM; + + /// Internal specialization. + typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), + BlockHistogramSort, + BlockHistogramAtomic >::Type InternalBlockHistogram; + + /// Shared memory storage layout type for BlockHistogram + typedef typename InternalBlockHistogram::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{BlockHistogram} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockHistogram() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
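As with BlockExchange, the snippet in the class documentation above dropped its template arguments during the import. Restored, with the include path assumed and a blocked direct load standing in for the elided sample-gathering code, it looks like:

```cuda
// The class-level BlockHistogram snippet with its template arguments restored.
#include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>; path assumed

__global__ void ExampleKernel(const unsigned char *d_samples)
{
    // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads
    // having 4 character samples each
    typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;

    // Allocate shared memory for BlockHistogram
    __shared__ typename BlockHistogram::TempStorage temp_storage;

    // Allocate shared memory for block-wide histogram bin counts
    __shared__ unsigned int smem_histogram[256];

    // Obtain input samples per thread (a blocked load, for illustration)
    unsigned char data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_samples, data);

    // Compute the block-wide histogram
    BlockHistogram(temp_storage).Histogram(data, smem_histogram);
}
```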
+ */ + __device__ __forceinline__ BlockHistogram( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Histogram operations + *********************************************************************/ + //@{ + + + /** + * \brief Initialize the shared histogram counters to zero. + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template + __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) + { + // Initialize histogram bin counts to zeros + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + histogram[histo_offset + linear_tid] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + histogram[histo_offset + linear_tid] = 0; + } + } + + + /** + * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... 
+ * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Histogram( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Initialize histogram bin counts to zeros + InitHistogram(histogram); + + CTA_SYNC(); + + // Composite the histogram + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + + + + /** + * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_load.cuh b/cpp/nvgraph/external/cub_semiring/block/block_load.cuh new file mode 100644 index 00000000000..234dad295a0 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_load.cuh @@ -0,0 +1,1268 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for reading linear tiles of data into the CUDA thread block. + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM]; + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
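The guarded overload documented above is what makes partial final tiles safe: only indices below valid_items are dereferenced, and the oob_default overload that follows additionally fills the out-of-range slots. A sketch of the usual per-tile pattern; the kernel name and sizes are hypothetical.

```cuda
// Guarded blocked load for a partial final tile.
#include <cub/cub.cuh>   // include path assumed

constexpr int BLOCK_THREADS    = 128;
constexpr int ITEMS_PER_THREAD = 4;
constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

__global__ void CopyTilesKernel(const int *d_in, int *d_out, int num_items)
{
    int tile_offset = blockIdx.x * TILE_ITEMS;
    int valid_items = num_items - tile_offset;   // < TILE_ITEMS only for the last tile

    int items[ITEMS_PER_THREAD];

    if (valid_items >= TILE_ITEMS)
        cub::LoadDirectBlocked(threadIdx.x, d_in + tile_offset, items);                 // full tile
    else
        cub::LoadDirectBlocked(threadIdx.x, d_in + tile_offset, items, valid_items, 0); // guarded, zero-filled

    // Symmetric guarded blocked store
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int idx = threadIdx.x * ITEMS_PER_THREAD + i;
        if (tile_offset + idx < num_items)
            d_out[tile_offset + idx] = items[i];
    }
}
```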
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + { + items[ITEM] = thread_itr[ITEM]; + } + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Internal implementation for load vectorization + */ +template < + CacheLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + // Biggest memory access word that T is a whole multiple of + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), + + VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? + 4 : + (TOTAL_WORDS % 2 == 0) ? + 2 : + 1, + + VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Vector items + Vector vec_items[VECTORS_PER_THREAD]; + + // Aliased input ptr + Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + { + vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + } +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. 
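The vectorized loader above picks its vector width as the widest of 4/2/1 device words that evenly divides the per-thread footprint. A small host-side recomputation for a few illustrative cases, under the simplifying assumption that the device word equals the element type (true for built-in primitives; cub's UnitWord may choose a wider word for aggregates):

```cuda
// How the vectorized loader chooses its width. Illustrative types only.
#include <cstdio>

template <typename T, int ITEMS_PER_THREAD>
void Report(const char *name)
{
    // Simplification: one device word per element (holds for primitives).
    const int TOTAL_WORDS = ITEMS_PER_THREAD;
    const int VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 :
                            (TOTAL_WORDS % 2 == 0) ? 2 : 1;
    const int VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE;

    printf("%s x %d -> %d vector load(s) of %d element(s), %zu bytes each\n",
           name, ITEMS_PER_THREAD, VECTORS_PER_THREAD, VECTOR_SIZE,
           VECTOR_SIZE * sizeof(T));
}

int main()
{
    Report<int,   4>("int");     // one 16-byte v4 load per thread
    Report<float, 6>("float");   // three 8-byte v2 loads
    Report<short, 7>("short");   // odd count: falls back to scalar loads
    return 0;
}
```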
+ * + * \blocked + * + * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void LoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); +} + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
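LoadDirectStriped is commonly paired with BlockExchange::StripedToBlocked when the algorithm wants blocked registers but the global load should stay coalesced; this is effectively what the BLOCK_LOAD_TRANSPOSE policy further below automates. A sketch under stated assumptions (include path, kernel name, and sizes are illustrative):

```cuda
// Coalesced striped load followed by an in-register transpose to blocked order.
#include <cub/cub.cuh>   // include path assumed

constexpr int BLOCK_THREADS    = 128;
constexpr int ITEMS_PER_THREAD = 4;

__global__ void StripedLoadThenBlockKernel(const int *d_in, int num_items, int *d_out)
{
    typedef cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchange;
    __shared__ typename BlockExchange::TempStorage temp_storage;

    int tile_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
    int valid_items = num_items - tile_offset;

    int items[ITEMS_PER_THREAD];

    // Striped, guarded, zero-filled load: consecutive threads touch
    // consecutive addresses, so the transactions stay coalesced.
    cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + tile_offset,
                                          items, valid_items, 0);

    // Transpose to a blocked arrangement for per-thread sequential work
    BlockExchange(temp_storage).StripedToBlocked(items, items);

    // Example blocked-order processing: per-thread sum of its 4 consecutive items
    int thread_sum = 0;
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        thread_sum += items[i];
    d_out[blockIdx.x * BLOCK_THREADS + threadIdx.x] = thread_sum;
}
```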
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
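The warp-striped index arithmetic above (tid, wid, warp_offset) can be checked on the host. A small sketch, assuming the usual 32-thread warp, that reproduces which element of the block's segment a given thread reads for each item:

#include <cassert>

// Mirror of the device-side arithmetic in LoadDirectWarpStriped, assuming 32-thread warps.
int WarpStripedIndex(int linear_tid, int item, int items_per_thread)
{
    int tid         = linear_tid & 31;          // lane within the warp
    int wid         = linear_tid >> 5;          // warp id within the block
    int warp_offset = wid * 32 * items_per_thread;
    return warp_offset + tid + item * 32;
}

int main()
{
    // Thread 0 of warp 1 (linear_tid 32), 4 items per thread:
    // it reads elements 128, 160, 192, 224 of the block's segment.
    assert(WarpStripedIndex(32, 0, 4) == 128);
    assert(WarpStripedIndex(32, 3, 4) == 224);
    return 0;
}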
+ */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group + +/** @} */ // end group UtilIo + + + +//----------------------------------------------------------------------------- +// Generic BlockLoad abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ +enum BlockLoadAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * directly from memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_LOAD_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * from memory using CUDA's built-in vectorized loads as a coalescing optimization. + * For example, ld.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector load width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p InputIteratorTis not a simple pointer type + * - The block input offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_LOAD_VECTORIZE, + + /** + * \par Overview + * + * A [striped arrangement](index.html#sec5sec3) of data is read + * efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. 
+ */ + BLOCK_LOAD_TRANSPOSE, + + + /** + * \par Overview + * + * A [warp-striped arrangement](index.html#sec5sec3) of data is + * read efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly larger latencies than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + * - Provisions more shared storage, but incurs smaller latencies than the + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE, + + + /** + * \par Overview + * + * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and then is locally transposed into a + * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory + * requirement, only one warp's worth of shared memory is provisioned and is + * subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, +}; + + +/** + * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockLoad class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockLoad can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory using CUDA's built-in vectorized loads as a + * coalescing optimization. 
[More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_TRANSPOSE.  A [striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE.  A [warp-striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED.  A [warp-striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockLoad}
+ * \par
+ * The code snippet below illustrates the loading of a linear
+ * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+ * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+ * meaning memory references are efficiently coalesced using a warp-striped access
+ * pattern (after which items are locally reordered among threads).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+ *
+ *     // Allocate shared memory for BlockLoad
+ *     __shared__ typename BlockLoad::TempStorage temp_storage;
+ *
+ *     // Load a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     BlockLoad(temp_storage).Load(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, ....
+ * The set of \p thread_data across the block of threads will be
+ * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
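For completeness, a compilable variant of the snippet above that also guards the last, partially full tile (hypothetical kernel and parameter names; stock <cub/cub.cuh> include assumed):

#include <cub/cub.cuh>

__global__ void GuardedBlockLoadKernel(const int *d_data, int num_items, int *d_out)
{
    enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4, TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };

    typedef cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
    __shared__ typename BlockLoad::TempStorage temp_storage;

    // Offset of this block's tile and the number of items still in bounds
    int block_offset = blockIdx.x * TILE_SIZE;
    int valid_items  = num_items - block_offset;

    int thread_data[ITEMS_PER_THREAD];
    if (valid_items >= TILE_SIZE)
        BlockLoad(temp_storage).Load(d_data + block_offset, thread_data);                   // full tile
    else
        BlockLoad(temp_storage).Load(d_data + block_offset, thread_data, valid_items, -1);  // partial tile

    // ... consume thread_data; here we just write back the first item per thread
    d_out[blockIdx.x * BLOCK_THREADS + threadIdx.x] = thread_data[0];
}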
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockLoad +{ +private: + + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Load helper + template + struct LoadInternal; + + + /** + * BLOCK_LOAD_DIRECT specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_VECTORIZE specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + 
InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT> + __device__ __forceinline__ void Load( + CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + } + + /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Load( + _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to 
prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + if (linear_tid == 0) + 
temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef LoadInternal InternalLoad; + + + /// Shared memory storage layout type + typedef typename InternalLoad::TempStorage 
_TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + /// \smemstorage{BlockLoad} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockLoad() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Load a linear segment of items from memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range. 
+ * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads + * being unmasked to load portions of valid data (and other items remaining unassigned). + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., + * \p valid_items is \p 5, and the out-of-bounds default is \p -1. 
+ * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads + * being unmasked to load portions of valid data (and other items are assigned \p -1) + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + } + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_radix_rank.cuh b/cpp/nvgraph/external/cub_semiring/block/block_radix_rank.cuh new file mode 100644 index 00000000000..77500ba0ede --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_radix_rank.cuh @@ -0,0 +1,697 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block + */ + +#pragma once + +#include + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_scan.cuh" +#include "../block/block_scan.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. 
+ * \ingroup BlockModule + * + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam RADIX_BITS The number of radix bits per digit place + * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * Blah... + * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par Examples + * \par + * - Example 1: Simple radix rank of 32-bit integer keys + * \code + * #include + * + * template + * __global__ void ExampleKernel(...) + * { + * + * \endcode + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRank +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + // Integer type for digit counters (to be packed into words of type PackedCounters) + typedef unsigned short DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), + unsigned long long, + unsigned int>::Type PackedCounter; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // The number of packed counters per thread (plus one for padding) + PADDED_COUNTER_LANES = COUNTER_LANES + 1, + RAKING_SEGMENT = PADDED_COUNTER_LANES, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), + }; + +private: + + + /// BlockScan type + typedef BlockScan< + PackedCounter, + BLOCK_DIM_X, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScan; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) 
_TempStorage + { + union Aliasable + { + DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; + + } aliasable; + + // Storage for scanning local ranks + typename BlockScan::TempStorage block_scan; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /// Copy of raking segment, promoted to registers + PackedCounter cached_segment[RAKING_SEGMENT]; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal storage allocator + */ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Performs upsweep raking reduction, returning the aggregate + */ + __device__ __forceinline__ PackedCounter Upsweep() + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + PackedCounter *raking_ptr; + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data into registers + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + cached_segment[i] = smem_raking_ptr[i]; + } + raking_ptr = cached_segment; + } + else + { + raking_ptr = smem_raking_ptr; + } + + return internal::ThreadReduce(raking_ptr, Sum()); + } + + + /// Performs exclusive downsweep raking scan + __device__ __forceinline__ void ExclusiveDownsweep( + PackedCounter raking_partial) + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + + PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? + cached_segment : + smem_raking_ptr; + + // Exclusive raking downsweep scan + internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /** + * Reset shared memory digit counters + */ + __device__ __forceinline__ void ResetCounters() + { + // Reset shared memory digit counters + #pragma unroll + for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) + { + *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; + } + } + + + /** + * Block-scan prefix callback + */ + struct PrefixCallBack + { + __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) + { + PackedCounter block_prefix = 0; + + // Propagate totals in packed fields + #pragma unroll + for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) + { + block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); + } + + return block_prefix; + } + }; + + + /** + * Scan shared memory digit counters. 
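The PrefixCallBack above is the one subtle piece of arithmetic: in the default four-byte bank mode a 32-bit PackedCounter holds two 16-bit DigitCounters, and the callback seeds the upper field's exclusive prefix with the total of the lower field, so digits counted in the upper field are ranked after every key counted in the lower one. A small host-side sketch of that arithmetic with made-up counts:

#include <cassert>
#include <cstdint>

int main()
{
    // Suppose the block-wide aggregate of the packed scan is:
    //   low  16 bits: 300 keys counted through sub-counter 0
    //   high 16 bits: 212 keys counted through sub-counter 1
    const uint32_t block_aggregate = (212u << 16) | 300u;

    // PrefixCallBack: block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
    // with PACKING_RATIO == 2 this is a single shift by 16 bits.
    uint32_t block_prefix = block_aggregate << 16;

    assert((block_prefix & 0xFFFFu) == 0);    // lower field's exclusive scan starts at 0
    assert((block_prefix >> 16)     == 300);  // upper field starts after the 300 lower-field keys
    return 0;
}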
+ */ + __device__ __forceinline__ void ScanCounters() + { + // Upsweep scan + PackedCounter raking_partial = Upsweep(); + + // Compute exclusive sum + PackedCounter exclusive_partial; + PrefixCallBack prefix_call_back; + BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); + + // Downsweep scan with exclusive partial + ExclusiveDownsweep(exclusive_partial); + } + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit + DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem + + // Reset shared memory digit counters + ResetCounters(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Get digit + unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); + + // Get sub-counter + unsigned int sub_counter = digit >> LOG_COUNTER_LANES; + + // Get counter lane + unsigned int counter_lane = digit & (COUNTER_LANES - 1); + + if (IS_DESCENDING) + { + sub_counter = PACKING_RATIO - 1 - sub_counter; + counter_lane = COUNTER_LANES - 1 - counter_lane; + } + + // Pointer to smem digit counter + digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; + + // Load thread-exclusive prefix + thread_prefixes[ITEM] = *digit_counters[ITEM]; + + // Store inclusive prefix + *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; + } + + CTA_SYNC(); + + // Scan shared memory counters + ScanCounters(); + + CTA_SYNC(); + + // Extract the local ranks of each key + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Add in thread block exclusive prefix + ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; + } + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. 
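Since the class-level example earlier in this header is only a stub, here is a minimal hypothetical usage sketch (kernel name and the stock <cub/cub.cuh> include are assumptions): rank one 4-bit digit place of unsigned keys, one key per thread, across a 128-thread block.

#include <cub/cub.cuh>

__global__ void RankKernel(const unsigned int *d_keys_in, int *d_ranks_out)
{
    typedef cub::BlockRadixRank<128, 4, false> BlockRadixRank;   // 128 threads, 4 radix bits, ascending
    __shared__ typename BlockRadixRank::TempStorage temp_storage;

    unsigned int keys[1] = { d_keys_in[threadIdx.x] };
    int ranks[1];

    // Rank by the digit in bits [0, 4); keys with equal digits keep their thread order
    BlockRadixRank(temp_storage).RankKeys(keys, ranks, 0, 4);

    d_ranks_out[threadIdx.x] = ranks[0];
}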
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + // Rank keys + RankKeys(keys, ranks, current_bit, num_bits); + + // Get the inclusive and exclusive digit totals corresponding to the calling thread. + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the + // first counter column, resulting in unavoidable bank conflicts.) + unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); + unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); + + exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; + } + } + } +}; + + + + + +/** + * Radix-rank using match.any + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRankMatch +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + typedef int32_t RankT; + typedef int32_t DigitCounterT; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + PADDED_WARPS = ((WARPS & 0x1) == 0) ? + WARPS + 1 : + WARPS, + + COUNTERS = PADDED_WARPS * RADIX_DIGITS, + RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, + PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? 
+ RAKING_SEGMENT + 1 : + RAKING_SEGMENT, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), + }; + +private: + + /// BlockScan type + typedef BlockScan< + DigitCounterT, + BLOCK_THREADS, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScanT; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + typename BlockScanT::TempStorage block_scan; + + union __align__(16) Aliasable + { + volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; + DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; + + } aliasable; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRankMatch( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + // Initialize shared digit counters + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; + + CTA_SYNC(); + + // Each warp will strip-mine its section of input, one strip at a time + + volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; + uint32_t lane_id = LaneId(); + uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; + uint32_t lane_mask_lt = LaneMaskLt(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // My digit + uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); + + if (IS_DESCENDING) + digit = RADIX_DIGITS - digit - 1; + + // Mask of peers who have same digit as me + uint32_t peer_mask = MatchAny(digit); + + // Pointer to smem digit counter for this key + digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; + + // Number of occurrences in previous strips + DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of peers having same digit as me + int32_t digit_count = __popc(peer_mask); + + // Number of lower-ranked peers having same digit seen so far + int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); + + if (peer_digit_prefix == 0) + { + // First thread for each digit updates the shared warp counter + *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); + } + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of prior keys having same digit + ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); + } + + CTA_SYNC(); + + // Scan warp counters + + DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; + + BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; + + CTA_SYNC(); + + // Seed ranks with counter values from previous warps + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + ranks[ITEM] += *digit_counters[ITEM]; + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + RankKeys(keys, ranks, current_bit, num_bits); + + // Get exclusive count for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_radix_sort.cuh b/cpp/nvgraph/external/cub_semiring/block/block_radix_sort.cuh new file mode 100644 index 00000000000..736fbde746a --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_radix_sort.cuh @@ -0,0 +1,862 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + */ + + +#pragma once + +#include "block_exchange.cuh" +#include "block_radix_rank.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. 
![](sorting_logo.png) + * \ingroup BlockModule + * + * \tparam KeyT KeyT type + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) + * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. Within each key, the implementation treats fixed-length + * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * - \rowmajor + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockRadixSort} + * \par + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * ... + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
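The overview snippet above lost its angle-bracket template arguments during extraction. Below is a minimal, self-contained sketch of the same keys-only sort with the `<int, 128, 4>` specialization written out; the kernel name, the `d_keys` pointer, and the explicit blocked load/store are illustrative assumptions added for self-containment.

```cuda
#include <cub/cub.cuh>   // assumed include; the block_radix_sort.cuh header alone also suffices

__global__ void ExampleKernel(int *d_keys)   // d_keys: one 512-key tile for this block (assumption)
{
    // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;

    // Allocate shared memory for BlockRadixSort
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    // Obtain a segment of consecutive items that are blocked across threads
    int thread_keys[4];
    for (int i = 0; i < 4; ++i)
        thread_keys[i] = d_keys[threadIdx.x * 4 + i];

    // Collectively sort the keys
    BlockRadixSort(temp_storage).Sort(thread_keys);

    // Write the sorted tile back in the same blocked arrangement
    for (int i = 0; i < 4; ++i)
        d_keys[threadIdx.x * 4 + i] = thread_keys[i];
}
```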
+ * + */ +template < + typename KeyT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + typename ValueT = NullType, + int RADIX_BITS = 4, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixSort +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + // Whether or not there are values to be trucked along with keys + KEYS_ONLY = Equals::VALUE, + }; + + // KeyT traits and unsigned bits type + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + /// Ascending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + false, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + AscendingBlockRadixRank; + + /// Descending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + true, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + DescendingBlockRadixRank; + + /// BlockExchange utility type for keys + typedef BlockExchange BlockExchangeKeys; + + /// BlockExchange utility type for values + typedef BlockExchange BlockExchangeValues; + + /// Shared memory storage layout type + union _TempStorage + { + typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; + typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + /// Rank keys (specialized for ascending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// Rank keys (specialized for descending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT 
(&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); + } + + /// ExchangeValues (specialized for key-value sort, to-striped arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + } + + /// ExchangeValues (specialized for keys-only sort) + template + __device__ __forceinline__ void ExchangeValues( + ValueT (&/*values*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + {} + + /// Sort blocked arrangement + template + __device__ __forceinline__ void SortBlocked( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit if done + if (begin_bit >= end_bit) break; + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Sort blocked -> striped arrangement + template + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked 
keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); + + // Last pass exchanges through shared memory in striped arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /// \smemstorage{BlockRadixSort} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangements) + *********************************************************************/ + //@{ + + /** + * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
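The Sort() overloads declared next accept optional begin_bit / end_bit arguments that restrict which key bits participate in the comparison. As a hedged sketch (the 16-bit bound, kernel name, and `d_keys` pointer are assumptions), limiting the range halves the number of 4-bit radix passes when keys are known to fit in 16 bits:

```cuda
#include <cub/cub.cuh>

__global__ void PartialBitSortKernel(unsigned int *d_keys)
{
    typedef cub::BlockRadixSort<unsigned int, 128, 4> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    unsigned int thread_keys[4];
    for (int i = 0; i < 4; ++i)
        thread_keys[i] = d_keys[threadIdx.x * 4 + i];

    // Only bit positions [0, 16) are inspected, so only 4 of the usual 8
    // 4-bit radix passes run (assumes the keys really are < 2^16)
    BlockRadixSort(temp_storage).Sort(thread_keys, 0, 16);

    for (int i = 0; i < 4; ++i)
        d_keys[threadIdx.x * 4 + i] = thread_keys[i];
}
```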
+ */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + /** + * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. 
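The "truck along more than one tile of values" note above can be made concrete with the usual index trick: sort (key, original index) pairs, then use the sorted indices as a gather vector for any number of payload arrays. In this hedged sketch the kernel name, payload arrays, and separate output buffers are assumptions; only the key/index sort itself is the documented API.

```cuda
#include <cub/cub.cuh>

__global__ void SortWithGatherKernel(int *d_keys,
                                     const float *d_payload_a, const float *d_payload_b,
                                     float *d_out_a, float *d_out_b)
{
    typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSortT;   // int values = original indices
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    int thread_keys[4];
    int thread_idx[4];
    for (int i = 0; i < 4; ++i)
    {
        int offset = threadIdx.x * 4 + i;
        thread_keys[i] = d_keys[offset];
        thread_idx[i]  = offset;              // enumerate original positions
    }

    // Key-value sort of (key, original index); the collective synchronizes the block
    BlockRadixSortT(temp_storage).Sort(thread_keys, thread_idx);

    // The sorted indices now act as a gather vector for any associated payloads
    for (int i = 0; i < 4; ++i)
    {
        int offset = threadIdx.x * 4 + i;
        d_keys[offset]  = thread_keys[i];
        d_out_a[offset] = d_payload_a[thread_idx[i]];
        d_out_b[offset] = d_payload_b[thread_idx[i]];
    }
}
```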
+ * + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangement -> striped arrangement) + *********************************************************************/ + //@{ + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. 
+ * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
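A common reason to prefer the *BlockedToStriped variants documented here is that the striped result can be stored to global memory with fully coalesced accesses. A minimal sketch under the same 128x4 configuration (the kernel name and `d_in` / `d_out` pointers are assumptions):

```cuda
#include <cub/cub.cuh>

__global__ void SortToStripedKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    int thread_keys[4];
    for (int i = 0; i < 4; ++i)
        thread_keys[i] = d_in[threadIdx.x * 4 + i];        // blocked load

    // Sort, leaving the keys in a striped arrangement across the block
    BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);

    // Striped store: consecutive threads write consecutive addresses
    for (int i = 0; i < 4; ++i)
        d_out[threadIdx.x + i * 128] = thread_keys[i];
}
```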
+ * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. + * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + +}; + +/** + * \example example_block_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_raking_layout.cuh b/cpp/nvgraph/external/cub_semiring/block/block_raking_layout.cuh new file mode 100644 index 00000000000..ab6b71036cd --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_raking_layout.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + */ + + +#pragma once + +#include "../util_macro.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) + * \ingroup BlockModule + * + * \par Overview + * This type facilitates a shared memory usage pattern where a block of CUDA + * threads places elements into shared memory and then reduces the active + * parallelism to one "raking" warp of threads for serially aggregating consecutive + * sequences of shared items. Padding is inserted to eliminate bank conflicts + * (for most data types). + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. 
+ * \tparam PTX_ARCH [optional] \ptxversion + */ +template < + typename T, + int BLOCK_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +struct BlockRakingLayout +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// The total number of elements that need to be cooperatively reduced + SHARED_ELEMENTS = BLOCK_THREADS, + + /// Maximum number of warp-synchronous raking threads + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + + /// Number of raking elements per warp-synchronous raking thread (rounded up) + SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, + + /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) + RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, + + /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) + HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + + /// Degree of bank conflicts (e.g., 4-way) + CONFLICT_DEGREE = (HAS_CONFLICTS) ? + (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + 1, + + /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load + USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), + + /// Total number of elements in the raking grid + GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), + + /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) + UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), + }; + + + /** + * \brief Shared memory storage type + */ + struct __align__(16) _TempStorage + { + T buff[BlockRakingLayout::GRID_ELEMENTS]; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /** + * \brief Returns the location for the calling thread to place data into the grid + */ + static __device__ __forceinline__ T* PlacementPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + // Offset for partial + unsigned int offset = linear_tid; + + // Add in one padding element for every segment + if (USE_SEGMENT_PADDING > 0) + { + offset += offset / SEGMENT_LENGTH; + } + + // Incorporating a block of padding partials every shared memory segment + return temp_storage.Alias().buff + offset; + } + + + /** + * \brief Returns the location for the calling thread to begin sequential raking + */ + static __device__ __forceinline__ T* RakingPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_reduce.cuh b/cpp/nvgraph/external/cub_semiring/block/block_reduce.cuh new file mode 100644 index 00000000000..a9de9e71742 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_reduce.cuh @@ -0,0 +1,607 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
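To make the BlockRakingLayout contract defined just above concrete, here is a hedged sketch of the placement/raking pattern it is built for. The helper name, the atomicAdd used to fold per-segment sums, and the assumption that BLOCK_THREADS is a multiple of the warp size (so every segment is full) are all illustrative; CUB's own reduction specializations finish with a warp-synchronous reduction instead.

```cuda
#include <cub/cub.cuh>

// Hypothetical helper: block-wide sum built directly on BlockRakingLayout
template <int BLOCK_THREADS>
__device__ int RakingBlockSum(int partial)
{
    typedef cub::BlockRakingLayout<int, BLOCK_THREADS> Layout;
    __shared__ typename Layout::TempStorage temp_storage;
    __shared__ int block_total;

    if (threadIdx.x == 0)
        block_total = 0;

    // Phase 1: every thread deposits its partial into the conflict-free grid
    *Layout::PlacementPtr(temp_storage, threadIdx.x) = partial;
    __syncthreads();

    // Phase 2: the raking threads serially reduce their padded segments
    if (threadIdx.x < Layout::RAKING_THREADS)
    {
        int *segment = Layout::RakingPtr(temp_storage, threadIdx.x);
        int sum = segment[0];
        #pragma unroll
        for (int i = 1; i < Layout::SEGMENT_LENGTH; ++i)
            sum += segment[i];
        atomicAdd(&block_total, sum);   // illustration only; not how CUB combines raking partials
    }
    __syncthreads();
    return block_total;
}
```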
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_reduce_raking.cuh" +#include "specializations/block_reduce_raking_commutative_only.cuh" +#include "specializations/block_reduce_warp_reductions.cuh" +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * BlockReduceAlgorithm enumerates alternative algorithms for parallel + * reduction across a CUDA thread block. + */ +enum BlockReduceAlgorithm +{ + + /** + * \par Overview + * An efficient "raking" reduction algorithm that only supports commutative + * reduction operators (true for most operations, e.g., addition). + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Threads in warps other than the first warp place + * their partial reductions into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within the first + * warp continue to accumulate by raking across segments of shared partial reductions + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE + * and is preferable when the reduction operator is commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, + + + /** + * \par Overview + * An efficient "raking" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. \blocked. + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs more communication than BLOCK_REDUCE_RAKING + * and is only preferable when the reduction operator is non-commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING, + + + /** + * \par Overview + * A quick "tiled warp-reductions" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. + * + * \par + * Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. + * + * \par + * \image html block_scan_warpscans.png + *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING + * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall + * throughput across the GPU. However turn-around latency may be lower and + * thus useful when the GPU is under-occupied. + */ + BLOCK_REDUCE_WARP_REDUCTIONS, +}; + + +/****************************************************************************** + * Block reduce + ******************************************************************************/ + +/** + * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being reduced + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - \rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Very efficient (only one synchronization barrier). + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - Every thread has a valid input (i.e., full vs. partial-tiles) + * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockReduce} + * \par + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; + + /// Internal specialization type + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + WarpReductions, + typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), + RakingCommutativeOnly, + Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, reduction_op); + return Reduce(partial, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * if (threadIdx.x < num_valid) thread_data = ... 
+ * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction functor + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + else + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + } + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); + } + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
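The BlockReduce snippets in this header likewise lost their template arguments. The following hedged reconstruction spells out the `<int, 128>` specialization for a Sum() over four items per thread; the kernel name, the `d_in` / `d_block_sums` pointers, and the blocked load are assumptions, and only thread0 receives a defined aggregate, as noted above.

```cuda
#include <cub/cub.cuh>

__global__ void BlockSumKernel(const int *d_in, int *d_block_sums)
{
    // Specialize BlockReduce for a 1D block of 128 threads on type int
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Blocked load: 4 consecutive items per thread (512 items per block)
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = d_in[blockIdx.x * 512 + threadIdx.x * 4 + i];

    // Block-wide sum; the return value is only defined for thread0
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = aggregate;
}
```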
+ */ + template + __device__ __forceinline__ T Sum( + T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, cub::Sum()); + return Sum(partial); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item (up to num_items) + * int thread_data; + * if (threadIdx.x < num_valid) + * thread_data = ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + else + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + } + + + //@} end member group +}; + +/** + * \example example_block_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_scan.cuh b/cpp/nvgraph/external/cub_semiring/block/block_scan.cuh new file mode 100644 index 00000000000..245084cff61 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_scan.cuh @@ -0,0 +1,2126 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_scan_raking.cuh" +#include "specializations/block_scan_warp_scans.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + */ +enum BlockScanAlgorithm +{ + + /** + * \par Overview + * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. + * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_raking.png + *
<div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
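To make the five phases concrete, here is a hedged, purely sequential host-side sketch (not part of this header) of the same reduce-then-scan decomposition for an exclusive sum; in the device implementation, the scan of the per-thread partials is performed warp-synchronously by the raking warp:

    #include <vector>

    // Exclusive prefix sum over threads*items values, organized the way
    // BLOCK_SCAN_RAKING organizes its work (names and layout are illustrative).
    static void RakingStyleExclusiveSum(const int *in, int *out, int threads, int items)
    {
        std::vector<int> partial(threads);

        // Phase 1: "upsweep in registers" -- each thread reduces its own items
        for (int t = 0; t < threads; ++t)
        {
            partial[t] = 0;
            for (int i = 0; i < items; ++i)
                partial[t] += in[t * items + i];
        }

        // Phases 2-3: exclusive scan of the per-thread partials (the raking warp's job)
        int running = 0;
        for (int t = 0; t < threads; ++t)
        {
            int p = partial[t];
            partial[t] = running;
            running += p;
        }

        // Phases 4-5: "downsweep" -- re-scan each thread's items, seeded with its scanned partial
        for (int t = 0; t < threads; ++t)
        {
            int seed = partial[t];
            for (int i = 0; i < items; ++i)
            {
                out[t * items + i] = seed;
                seed += in[t * items + i];
            }
        }
    }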
+ * + * \par Performance Considerations + * - Although this variant may suffer longer turnaround latencies when the + * GPU is under-occupied, it can often provide higher overall throughput + * across the GPU when suitably occupied. + */ + BLOCK_SCAN_RAKING, + + + /** + * \par Overview + * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at + * the expense of higher register pressure. Raking threads preserve their + * "upsweep" segment of values in registers while performing warp-synchronous + * scan, allowing the "downsweep" not to re-read them from shared memory. + */ + BLOCK_SCAN_RAKING_MEMOIZE, + + + /** + * \par Overview + * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. + * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_warpscans.png + *
<div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
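As a usage sketch (assuming the upstream <cub/cub.cuh> header; the kernel name is illustrative), this variant is requested through BlockScan's ALGORITHM template parameter; the class quietly falls back to raking when the block size is not a multiple of the warp size (see SAFE_ALGORITHM further below):

    #include <cub/cub.cuh>

    __global__ void LatencySensitiveScanKernel(int *d_data)
    {
        // Request the low-latency warp-scans variant explicitly
        typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT;
        __shared__ typename BlockScanT::TempStorage temp_storage;

        int thread_data = d_data[threadIdx.x];

        // Block-wide inclusive prefix sum using the requested algorithm
        BlockScanT(temp_storage).InclusiveSum(thread_data, thread_data);

        d_data[threadIdx.x] = thread_data;
    }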
+ * + * \par Performance Considerations + * - Although this variant may suffer lower overall throughput across the + * GPU because due to a heavy reliance on inefficient warpscans, it can + * often provide lower turnaround latencies when the GPU is under-occupied. + */ + BLOCK_SCAN_WARP_SCANS, +}; + + +/****************************************************************************** + * Block scan + ******************************************************************************/ + +/** + * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being scanned + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - \rowmajor + * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: + * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Invokes a minimal number of minimal block-wide synchronization barriers (only + * one or two depending on algorithm selection) + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Prefix sum variants (vs. generic scan) + * - \blocksize + * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockScan} + * \par + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. + * The corresponding output \p thread_data in those threads will be + * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy + * cannot be used with thread block sizes not a multiple of the + * architectural warp size. + */ + static const BlockScanAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? + BLOCK_SCAN_RAKING : + ALGORITHM; + + typedef BlockScanWarpScans WarpScans; + typedef BlockScanRaking Raking; + + /// Define the delegate type for the desired algorithm + typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), + WarpScans, + Raking>::Type InternalBlockScan; + + /// Shared memory storage layout type for BlockScan + typedef typename InternalBlockScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Public types + ******************************************************************************/ +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
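A hedged, self-contained variant of the simple 512-item example above, with the BlockScan specialization spelled out as cub::BlockScan<int, 128>; the kernel name and the d_in/d_out buffers are illustrative, and it assumes the upstream <cub/cub.cuh> header:

    #include <cub/cub.cuh>

    __global__ void ExclusiveSum512Kernel(const int *d_in, int *d_out)
    {
        // Specialize BlockScan for a 1D block of 128 threads on int
        typedef cub::BlockScan<int, 128> BlockScanT;
        __shared__ typename BlockScanT::TempStorage temp_storage;

        // Each of the 128 threads owns 4 consecutive items (512 items per block)
        int thread_data[4];
        for (int i = 0; i < 4; ++i)
            thread_data[i] = d_in[threadIdx.x * 4 + i];

        // Collectively compute the block-wide exclusive prefix sum
        BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);

        for (int i = 0; i < 4; ++i)
            d_out[threadIdx.x * 4 + i] = thread_data[i];
    }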
+ */ + __device__ __forceinline__ BlockScan() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
+ * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, ..., 127. 
+ * The output for the second segment will be 128, 129, ..., 255. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. + * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + + //@} end member group // Exclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
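For completeness, a compilable sketch of the single-item exclusive max scan documented above, with the specialization written out; it again assumes the upstream <cub/cub.cuh> header, and the kernel name is illustrative:

    #include <climits>
    #include <cub/cub.cuh>

    __global__ void ExclusiveMaxScanKernel(int *d_data)
    {
        typedef cub::BlockScan<int, 128> BlockScanT;
        __shared__ typename BlockScanT::TempStorage temp_storage;

        int thread_data = d_data[threadIdx.x];

        // Seeded with INT_MIN: thread0 receives INT_MIN, and thread i receives
        // the maximum of items 0..i-1
        BlockScanT(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());

        d_data[threadIdx.x] = thread_data;
    }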
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. + * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + //@} end member group +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, single datum per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); + } + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + //@} end member group +#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. 
The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + InclusiveScan(input, output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InclusiveScan(input, output, cub::Sum(), block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, ..., 128. + * The output for the second segment will be 129, 130, ..., 256. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
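// Illustrative sketch of the multi-item overload above: 128 threads x 4 items in
// a blocked arrangement, scanned in a single collective call.  Assumes the
// standard <cub/cub.cuh> header; names are hypothetical.
#include <cub/cub.cuh>

__global__ void BlockedInclusiveSumKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    // Blocked arrangement: thread i owns items [4*i, 4*i + 3]
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = d_in[threadIdx.x * 4 + i];

    // Inclusive prefix sum across all 512 items
    BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);

    for (int i = 0; i < 4; ++i)
        d_out[threadIdx.x * 4 + i] = thread_data[i];
}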
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage.scan).IncluisveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512. + * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
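// Illustrative, compilable version of the tile-scanning pattern above using
// BlockLoad/BlockStore for coalesced I/O (note the collective member is
// InclusiveSum).  Assumes 128 threads x 4 items per thread, num_items divisible
// by 512, and the standard <cub/cub.cuh> header; all names are hypothetical.
#include <cub/cub.cuh>

struct RunningSum
{
    int running_total;

    __device__ RunningSum(int initial) : running_total(initial) {}

    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};

__global__ void TiledBlockedInclusiveSumKernel(int *d_data, int num_items)
{
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>   BlockLoad;
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
    typedef cub::BlockScan<int, 128>                                      BlockScan;

    // The collectives are used in sequence, so their storage can be aliased
    __shared__ union
    {
        typename BlockLoad::TempStorage  load;
        typename BlockScan::TempStorage  scan;
        typename BlockStore::TempStorage store;
    } temp_storage;

    RunningSum prefix_op(0);

    for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
    {
        int thread_data[4];
        BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
        __syncthreads();

        BlockScan(temp_storage.scan).InclusiveSum(thread_data, thread_data, prefix_op);
        __syncthreads();

        BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
        __syncthreads();
    }
}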
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
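// Illustrative sketch for the generic InclusiveScan overloads above: a running
// maximum across one block, also recovering the block-wide maximum.  Assumes the
// standard <cub/cub.cuh> header; names are hypothetical.
#include <cub/cub.cuh>

__global__ void InclusiveMaxScanKernel(const int *d_in, int *d_out, int *d_max)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    // Inclusive prefix max; block_aggregate receives the maximum over all 128 items
    int block_aggregate;
    BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);

    d_out[threadIdx.x] = thread_data;
    if (threadIdx.x == 0)
        *d_max = block_aggregate;
}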
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. + * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan (with no initial value) + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. 
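// Illustrative sketch for the multi-item InclusiveScan overloads above, using a
// user-defined binary functor (a running minimum) over a blocked arrangement of
// 128 threads x 4 items.  Assumes the standard <cub/cub.cuh> header; names are
// hypothetical.
#include <cub/cub.cuh>

struct MinOp
{
    __device__ int operator()(const int &a, const int &b) const
    {
        return (b < a) ? b : a;
    }
};

__global__ void BlockedInclusiveMinScanKernel(const int *d_in, int *d_out, int *d_min)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    // Blocked arrangement: thread i owns items [4*i, 4*i + 3]
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = d_in[threadIdx.x * 4 + i];

    // Inclusive prefix min across all 512 items; d_min receives the overall minimum
    int block_aggregate;
    BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, MinOp(), block_aggregate);

    for (int i = 0; i < 4; ++i)
        d_out[threadIdx.x * 4 + i] = thread_data[i];

    if (threadIdx.x == 0)
        *d_min = block_aggregate;
}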
+ * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage.scan).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. + * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + //@} end member group + + +}; + +/** + * \example example_block_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_shuffle.cuh b/cpp/nvgraph/external/cub_semiring/block/block_shuffle.cuh new file mode 100644 index 00000000000..504f00e3552 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_shuffle.cuh @@ -0,0 +1,305 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. The BlockShuffle abstraction allows threads to efficiently shift items + * either (a) up to their successor or (b) down to their predecessor. + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockShuffle +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T prev[BLOCK_THREADS]; + T next[BLOCK_THREADS]; + }; + + +public: + + /// \smemstorage{BlockShuffle} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockShuffle() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockShuffle( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Shuffle movement + *********************************************************************/ + //@{ + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. 
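// Illustrative sketch for the BlockShuffle interface documented above (written
// against the upstream cub::BlockShuffle declaration in <cub/cub.cuh>; the
// vendored copy here may differ).  Each thread fetches its predecessor's value
// and emits an adjacent difference.  Names are hypothetical.
#include <cub/cub.cuh>

__global__ void AdjacentDifferenceKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockShuffle<int, 128> BlockShuffle;
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    // Obtain the item held by thread (i - 1); thread 0's value is left unchanged
    int predecessor = thread_data;
    BlockShuffle(temp_storage).Offset(thread_data, predecessor, -1);

    d_out[threadIdx.x] = thread_data - predecessor;
}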
+ * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Offset( + T input, ///< [in] The input item from the calling thread (threadi) + T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 + int distance = 1) ///< [in] Offset distance (may be negative) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS)) + output = temp_storage[linear_tid + distance].prev; + } + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Rotate( + T input, ///< [in] The calling thread's input item + T& output, ///< [out] The \p input item from thread thread(i+distance>)% (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 + unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + unsigned int offset = threadIdx.x + distance; + if (offset >= BLOCK_THREADS) + offset -= BLOCK_THREADS; + + output = temp_storage[offset].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by threadBLOCK_THREADS-1. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads + { + Up(input, prev); + block_suffix = temp_storage[BLOCK_THREADS - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. 
+ { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. + T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads + { + Up(input, prev); + block_prefix = temp_storage[BLOCK_THREADS - 1].prev; + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/block_store.cuh b/cpp/nvgraph/external/cub_semiring/block/block_store.cuh new file mode 100644 index 00000000000..63039afa8e5 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/block_store.cuh @@ -0,0 +1,1000 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + thread_itr[ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) 
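// Illustrative sketch for the guarded StoreDirectBlocked overload above: each of
// 128 threads writes up to 4 consecutive items, clamped to valid_items.  Assumes
// the standard <cub/cub.cuh> header; names are hypothetical.
#include <cub/cub.cuh>

__global__ void GuardedBlockedStoreKernel(int *d_out, int valid_items)
{
    // Blocked arrangement: thread i owns output slots [4*i, 4*i + 3]
    int items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = threadIdx.x * 4 + i;

    // Only the first valid_items slots of d_out are written
    cub::StoreDirectBlocked(threadIdx.x, d_out, items, valid_items);
}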
+ * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void StoreDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? + MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Alias global pointer + Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + + // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) + Vector raw_vector[VECTORS_PER_THREAD]; + T *raw_items = reinterpret_cast(raw_vector); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + raw_items[ITEM] = items[ITEM]; + } + + // Direct-store using vector types + StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); +} + + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
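// Illustrative sketch for the striped store paths above: the same 4 items per
// thread written in a striped arrangement (thread i writes slots i, i+128,
// i+256, i+384), guarded by valid_items.  Assumes the standard <cub/cub.cuh>
// header; names are hypothetical.
#include <cub/cub.cuh>

__global__ void GuardedStripedStoreKernel(int *d_out, int valid_items)
{
    int items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = threadIdx.x + i * 128;   // value equals its destination slot

    // BLOCK_THREADS is passed explicitly as the first template argument
    cub::StoreDirectStriped<128>(threadIdx.x, d_out, items, valid_items);
}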
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } + } +} + + +//@} end member group + + +/** @} */ // end group UtilIo + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * For example, st.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p OutputIteratorT is not a simple pointer type + * - The block output offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_STORE_VECTORIZE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
+ */ + BLOCK_STORE_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. + */ + BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + +}; + + +/** + * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam T The type of data to be written. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockStore class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockStore can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is written directly to memory using CUDA's built-in vectorized stores as a + * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockStore} + * \par + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid 
items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS 
= CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T 
(&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items); + } + + /** + * \brief Store items into a linear segment of memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. + * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * only the first two threads being unmasked to store portions of valid data. + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_histogram_atomic.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 00000000000..4599c092568 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
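The Doxygen example in the `block_store.cuh` header above lost its template arguments and include line in this diff view. For reference, a minimal sketch of the intended `BLOCK_STORE_WARP_TRANSPOSE` usage, assuming the vendored `cub_semiring` headers mirror upstream CUB 1.8; the kernel name and block shape here are illustrative, not part of the diff:

```cpp
// Illustrative sketch only; assumes upstream CUB headers (the vendored copy is derived from them).
#include <cub/cub.cuh>

__global__ void store_kernel(int *d_data)   // hypothetical kernel name
{
    // 128 threads, 4 consecutive items per thread, warp-transposed stores
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
    __shared__ typename BlockStore::TempStorage temp_storage;

    // Each thread owns 4 consecutive items of a blocked arrangement
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = threadIdx.x * 4 + i;

    // Items are locally exchanged into a warp-striped arrangement and then
    // written out with coalesced accesses
    BlockStore(temp_storage).Store(d_data, thread_data);
}
```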
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template +struct BlockHistogramAtomic +{ + /// Shared memory storage layout type + struct TempStorage {}; + + + /// Constructor + __device__ __forceinline__ BlockHistogramAtomic( + TempStorage &temp_storage) + {} + + + /// Composite data onto an existing histogram + template < + typename T, + typename CounterT, + int ITEMS_PER_THREAD> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_histogram_sort.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_histogram_sort.cuh new file mode 100644 index 00000000000..b9ad6fb79c5 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_histogram_sort.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
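`BlockHistogramAtomic` above is an internal specialization; it is normally reached through the public `cub::BlockHistogram` wrapper by selecting `cub::BLOCK_HISTO_ATOMIC`. A hedged sketch of that path, assuming the upstream wrapper API; names such as `histo_atomic_kernel` are illustrative:

```cpp
// Illustrative sketch only; assumes upstream CUB's public BlockHistogram wrapper,
// which dispatches to the atomic-based specialization when BLOCK_HISTO_ATOMIC is chosen.
#include <cub/cub.cuh>

__global__ void histo_atomic_kernel(unsigned int *d_histogram)   // hypothetical name
{
    // 128 threads, 4 samples per thread, 256 bins, atomic compositing
    typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC> BlockHistogram;
    __shared__ typename BlockHistogram::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    // Per-thread samples (derived from the thread id here, just for illustration)
    unsigned char samples[4];
    for (int i = 0; i < 4; ++i)
        samples[i] = (unsigned char) (threadIdx.x & 0xFF);

    // Zero the bin counts, then composite the samples via atomicAdd
    BlockHistogram(temp_storage).InitHistogram(smem_histogram);
    __syncthreads();
    BlockHistogram(temp_storage).Composite(samples, smem_histogram);
    __syncthreads();

    // Combine this block's counts into the global histogram
    for (int bin = threadIdx.x; bin < 256; bin += 128)
        atomicAdd(&d_histogram[bin], smem_histogram[bin]);
}
```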
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template < + typename T, ///< Sample type + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int ITEMS_PER_THREAD, ///< The number of samples per thread + int BINS, ///< The number of bins into which histogram samples may fall + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockHistogramSort +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort< + T, + BLOCK_DIM_X, + ITEMS_PER_THREAD, + NullType, + 4, + (PTX_ARCH >= 350) ? 
true : false, + BLOCK_SCAN_WARP_SCANS, + cudaSharedMemBankSizeFourByte, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity< + T, + BLOCK_DIM_X, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockDiscontinuityT; + + /// Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + CTA_SYNC(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + CTA_SYNC(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + CTA_SYNC(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + + // Finish up with guarded composition if 
necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_raking.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_raking.cuh new file mode 100644 index 00000000000..c2c26651796 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../block/block_raking_layout.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * + * Supports non-commutative binary reduction operators. Unlike commutative + * reduction operators (e.g., addition), the application of a non-commutative + * reduction operator (e.g, string concatenation) across a sequence of inputs must + * honor the relative ordering of items and partial reductions when applying the + * reduction operator. 
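For comparison, the sort-based specialization above is selected with `cub::BLOCK_HISTO_SORT` and avoids atomics entirely: samples are radix-sorted and runs of equal bin values are turned into counts. A sketch of the one-shot `Histogram()` call, under the same assumption that the vendored headers match upstream CUB; the kernel name is illustrative:

```cpp
// Illustrative sketch only; BLOCK_HISTO_SORT selects the sorting-based specialization above.
#include <cub/cub.cuh>

__global__ void histo_sort_kernel(unsigned int *d_histogram)   // hypothetical name
{
    typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_SORT> BlockHistogram;
    __shared__ typename BlockHistogram::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    unsigned char samples[4];
    for (int i = 0; i < 4; ++i)
        samples[i] = (unsigned char) ((threadIdx.x * 4 + i) & 0xFF);

    // Histogram() zero-initializes the bins, sorts the tile of samples, and
    // converts run lengths of equal bin values into counts (no atomics)
    BlockHistogram(temp_storage).Histogram(samples, smem_histogram);
    __syncthreads();

    for (int bin = threadIdx.x; bin < 256; bin += 128)
        atomicAdd(&d_histogram[bin], smem_histogram[bin]);
}
```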
+ * + * Compared to the implementation of BlockReduceRaking (which does not support + * non-commutative operators), this implementation requires a few extra + * rounds of inter-thread communication. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), + + /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two + WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, + + /// Whether or not accesses into smem are unguarded + RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, + + }; + + + /// Shared memory storage layout type + union _TempStorage + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + // Update partial if addend is in range + if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) + { + T addend = raking_segment[ITERATION]; + partial = reduction_op(partial, addend); + } + return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T * /*raking_segment*/, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + return partial; + } + + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
+ template < + bool IS_FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + return Reduce(partial, num_valid, reduction_op); + } + + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_raking_commutative_only.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_raking_commutative_only.cuh new file mode 100644 index 00000000000..ee2294607e9 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_raking_commutative_only.cuh @@ -0,0 +1,199 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
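`BlockReduceRaking` backs the public `cub::BlockReduce` when `cub::BLOCK_REDUCE_RAKING` is chosen; it tolerates non-commutative operators and partially full tiles. A sketch of a guarded block-wide sum along those lines, assuming the upstream `cub::BlockReduce` interface; the kernel name and block size are illustrative:

```cpp
// Illustrative sketch only; BLOCK_REDUCE_RAKING selects the raking specialization above.
#include <cub/cub.cuh>

__global__ void sum_raking_kernel(const int *d_in, int *d_out, int num_items)  // hypothetical name
{
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int block_offset  = blockIdx.x * 128;
    int tid           = block_offset + threadIdx.x;
    int thread_value  = (tid < num_items) ? d_in[tid] : 0;

    // Guarded reduction: only the first `valid` threads of the last block contribute
    int valid     = min(128, num_items - block_offset);
    int block_sum = BlockReduce(temp_storage).Sum(thread_value, valid);

    // The aggregate is only valid in thread 0
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}
```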
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + */ + +#pragma once + +#include "block_reduce_raking.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRakingCommutativeOnly +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values + typedef BlockReduceRaking FallBack; + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Whether or not to use fall-back + USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), + + /// Number of raking threads + RAKING_THREADS = WARP_THREADS, + + /// Number of threads actually sharing items with the raking threads + SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, + }; + + /// WarpReduce utility type + typedef WarpReduce WarpReduce; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Shared memory storage layout type + union _TempStorage + { + struct + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRakingCommutativeOnly( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Sum(partial); + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, reduction_op, partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_warp_reductions.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 00000000000..68495b4e77e --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
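The commutative-only raking variant above skips some inter-thread communication, but it requires a commutative operator and a block size that is a multiple of the warp size and larger than one warp; otherwise it falls back to `BlockReduceRaking`. A sketch of selecting it explicitly, again assuming the upstream `cub::BlockReduce` interface (names illustrative):

```cpp
// Illustrative sketch only; max() is commutative, so the cheaper variant applies.
#include <cub/cub.cuh>

__global__ void max_kernel(const float *d_in, float *d_out)   // hypothetical name
{
    // 128 threads is a multiple of the warp size, so no fallback is needed
    typedef cub::BlockReduce<float, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    float thread_value = d_in[blockIdx.x * 128 + threadIdx.x];
    float block_max    = BlockReduce(temp_storage).Reduce(thread_value, cub::Max());

    // Result is only valid in thread 0
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_max;
}
```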
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
+ */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the thread block size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire thread block + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; + warp_aggregate = reduction_op(warp_aggregate, addend); + } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + return warp_aggregate; + } + + + /// Returns block-wide aggregate in thread0. 
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + CTA_SYNC(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); + } + + return warp_aggregate; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid, + cub::Sum()); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < static_cast(num_valid)) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_raking.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_raking.cuh new file mode 100644 index 00000000000..2e21324c9ee --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_raking.cuh @@ -0,0 +1,666 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
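`BlockReduceWarpReductions` is the default strategy behind `cub::BlockReduce`: each warp reduces its own values, then lane 0 of warp 0 combines the per-warp partials as in `ApplyWarpAggregates` above. A sketch with an illustrative 256-thread block, assuming the upstream wrapper:

```cpp
// Illustrative sketch only; BLOCK_REDUCE_WARP_REDUCTIONS is the default algorithm tag.
#include <cub/cub.cuh>

__global__ void sum_warp_reductions_kernel(const int *d_in, int *d_out)   // hypothetical name
{
    typedef cub::BlockReduce<int, 256, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int thread_value = d_in[blockIdx.x * 256 + threadIdx.x];

    // Warp-level reductions first, then the per-warp aggregates are combined
    int block_sum = BlockReduce(temp_storage).Sum(thread_value);

    // Only thread 0 holds the block-wide total
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}
```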
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanRaking +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /// Templated reduction + template + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) + { + T addend = raking_ptr[ITERATION]; + raking_partial = scan_op(raking_partial, addend); + } + + return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); + } + + + /// Templated reduction (base case) + template + __device__ __forceinline__ T GuardedReduce( + T* /*raking_ptr*/, ///< [in] Input array + ScanOp /*scan_op*/, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + return raking_partial; + } + + + /// Templated copy + template + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type /*iteration*/) + { + out[ITERATION] = in[ITERATION]; + CopySegment(out, in, Int2Type()); + } + + + /// Templated copy (base case) + __device__ __forceinline__ void CopySegment( + T* /*out*/, ///< [out] Out array + T* /*in*/, ///< [in] Input array + Int2Type /*iteration*/) + {} 
+ + + /// Performs upsweep raking reduction, returning the aggregate + template + __device__ __forceinline__ T Upsweep( + ScanOp scan_op) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data into registers + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + + T raking_partial = cached_segment[0]; + + return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>()); + } + + + /// Performs exclusive downsweep raking scan + template + __device__ __forceinline__ void ExclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + /// Performs inclusive downsweep raking scan + template + __device__ __forceinline__ void InclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + exclusive_output = *placement_ptr; + } + } + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial= Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + + // Broadcast aggregate to other threads + if (linear_tid == 0) + temp_storage.block_aggregate = block_aggregate; + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + output = scan_op(block_prefix, output); + if (linear_tid == 0) + output = block_prefix; + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with exclusive warpscan partial + output = scan_op(block_prefix, output); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 00000000000..9252c0a3a7f --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,392 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScan; + + /// Shared memory storage layout type + + struct __align__(32) _TempStorage + { + T warp_aggregates[WARPS]; + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction + ScanOp /*scan_op*/, ///< [in] Binary scan operator + T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans2.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans2.cuh new file mode 100644 index 00000000000..eb0a3a1b54e --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans2.cuh @@ -0,0 +1,436 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScanT; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ 
__forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
+ T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
+ { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans3.cuh b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans3.cuh new file mode 100644 index 00000000000..18bd585823a --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/block/specializations/block_scan_warp_scans3.cuh @@ -0,0 +1,418 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, + + /// Number of outer scan warps + OUTER_WARPS = INNER_WARP_THREADS + }; + + /// Outer WarpScan utility type + typedef WarpScan OuterWarpScanT; + + /// Inner WarpScan utility type + typedef WarpScan InnerWarpScanT; + + typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; + + + /// Shared memory storage layout type + struct _TempStorage + { + union Aliasable + { + Uninitialized outer_warp_scan; ///< Buffer for warp-synchronous outer scans + typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan + + } aliasable; + + T warp_aggregates[OUTER_WARPS]; + + T block_aggregate; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS), + lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + { + temp_storage.warp_aggregates[warp_id] = inclusive_output; + } + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
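The call-back overload above expects a stateful functor: the first warp invokes its operator() with the tile's block-wide aggregate, and the value returned by lane0 becomes the prefix applied to that tile. A minimal running-prefix functor and a single-block tiled scan that drives it are sketched below, following the pattern documented for the public cub::BlockScan interface; all names are assumptions for illustration.

```cpp
#include <cub/cub.cuh>

// Running-prefix functor of the kind the prefix-callback overloads expect.
struct RunningPrefixOp
{
    int running_total;

    __device__ RunningPrefixOp(int initial) : running_total(initial) {}

    // Invoked by the first warp with the tile aggregate; returns the prefix
    // for the current tile and carries the sum into the next tile.
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};

// Sketch: exclusive prefix sum over a long segment, processed in 128-item
// tiles by a single 128-thread block.
__global__ void TiledExclusiveSum(int *d_data, int num_items)
{
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    RunningPrefixOp prefix_op(0);

    for (int tile_base = 0; tile_base < num_items; tile_base += 128)
    {
        int idx  = tile_base + threadIdx.x;
        int item = (idx < num_items) ? d_data[idx] : 0;

        BlockScanT(temp_storage).ExclusiveSum(item, item, prefix_op);
        __syncthreads();   // temp_storage is reused across tiles

        if (idx < num_items)
            d_data[idx] = item;
    }
}
```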
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial (or assign it if partial is invalid) + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/cub.cuh b/cpp/nvgraph/external/cub_semiring/cub.cuh new file mode 100644 index 00000000000..b1c8e3200ab --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/cub.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * CUB umbrella include file + */ + +#pragma once + + +// Block +#include "block/block_histogram.cuh" +#include "block/block_discontinuity.cuh" +#include "block/block_exchange.cuh" +#include "block/block_load.cuh" +#include "block/block_radix_rank.cuh" +#include "block/block_radix_sort.cuh" +#include "block/block_reduce.cuh" +#include "block/block_scan.cuh" +#include "block/block_store.cuh" +//#include "block/block_shift.cuh" + +// Device +#include "device/device_histogram.cuh" +#include "device/device_partition.cuh" +#include "device/device_radix_sort.cuh" +#include "device/device_reduce.cuh" +#include "device/device_run_length_encode.cuh" +#include "device/device_scan.cuh" +#include "device/device_segmented_radix_sort.cuh" +#include "device/device_segmented_reduce.cuh" +#include "device/device_select.cuh" +#include "device/device_spmv.cuh" + +// Grid +//#include "grid/grid_barrier.cuh" +#include "grid/grid_even_share.cuh" +#include "grid/grid_mapping.cuh" +#include "grid/grid_queue.cuh" + +// Thread +#include "thread/thread_load.cuh" +#include "thread/thread_operators.cuh" +#include "thread/thread_reduce.cuh" +#include "thread/thread_scan.cuh" +#include "thread/thread_store.cuh" + +// Warp +#include "warp/warp_reduce.cuh" +#include "warp/warp_scan.cuh" + +// Iterator +#include "iterator/arg_index_input_iterator.cuh" +#include "iterator/cache_modified_input_iterator.cuh" +#include "iterator/cache_modified_output_iterator.cuh" +#include "iterator/constant_input_iterator.cuh" +#include "iterator/counting_input_iterator.cuh" +#include "iterator/tex_obj_input_iterator.cuh" +#include "iterator/tex_ref_input_iterator.cuh" +#include "iterator/transform_input_iterator.cuh" + +// Util +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_device.cuh" +#include "util_macro.cuh" +#include "util_ptx.cuh" +#include "util_type.cuh" + diff --git 
a/cpp/nvgraph/external/cub_semiring/device/device_histogram.cuh b/cpp/nvgraph/external/cub_semiring/device/device_histogram.cuh new file mode 100644 index 00000000000..db131eee764 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_histogram.cuh @@ -0,0 +1,866 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_histogram.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * + * \par Usage Considerations + * \cdp_class{DeviceHistogram} + * + */ +struct DeviceHistogram +{ + /******************************************************************//** + * \name Evenly-segmented bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 
+ * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
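For reference, a self-contained host-side driver matching the six-bin snippet documented above could look as follows. It assumes a stock CUB installation reachable as <cub/cub.cuh> rather than the vendored cub_semiring copy, and it omits error checking for brevity.

```cpp
#include <cub/cub.cuh>
#include <cuda_runtime.h>

int main()
{
    const int   num_samples = 10;
    const float h_samples[num_samples] =
        {2.2f, 6.0f, 7.1f, 2.9f, 3.5f, 0.3f, 2.9f, 2.0f, 6.1f, 999.5f};

    float *d_samples   = NULL;
    int   *d_histogram = NULL;
    cudaMalloc((void**)&d_samples, sizeof(h_samples));
    cudaMalloc((void**)&d_histogram, 6 * sizeof(int));
    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);

    int   num_levels  = 7;     // seven boundaries delimit six bins of width 2.0
    float lower_level = 0.0f;
    float upper_level = 12.0f; // 999.5 falls outside [0, 12) and is not counted

    // First call sizes the temporary storage; the second call does the work.
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);

    cudaFree(d_temp_storage);
    cudaFree(d_histogram);
    cudaFree(d_samples);
    return 0;
}
```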
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * size_t row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
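A host-side driver for the quad-channel RGB example above is sketched below: four interleaved 8-bit channels, with 256-bin histograms computed for the first three. As before, it assumes a stock CUB installation and skips error checking.

```cpp
#include <cub/cub.cuh>
#include <cuda_runtime.h>

int main()
{
    const int           num_pixels = 5;
    const unsigned char h_samples[num_pixels * 4] = {
        2, 6, 7, 5,   3, 0, 2, 1,   7, 0, 6, 2,   0, 6, 7, 5,   3, 0, 2, 6};

    unsigned char *d_samples = NULL;
    cudaMalloc((void**)&d_samples, sizeof(h_samples));
    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);

    // One 256-counter histogram per active channel (R, G, B); the fourth
    // (alpha) channel is interleaved in the input but not histogrammed.
    int          *d_histogram[3];
    int           num_levels[3]  = {257, 257, 257};
    unsigned int  lower_level[3] = {0, 0, 0};
    unsigned int  upper_level[3] = {256, 256, 256};
    for (int c = 0; c < 3; ++c)
        cudaMalloc((void**)&d_histogram[c], 256 * sizeof(int));

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);

    cudaFree(d_temp_storage);
    for (int c = 0; c < 3; ++c)
        cudaFree(d_histogram[c]);
    cudaFree(d_samples);
    return 0;
}
```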
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramEven( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
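The overload above restricts the histogram to a two-dimensional region of interest, so both calls take num_row_pixels, num_rows, and row_stride_bytes in place of a flat pixel count. The sketch below mirrors the documented 2x3 region within a padded 2x4 pixel array; the padding values and variable names are illustrative.

```cpp
#include <cub/cub.cuh>
#include <cuda_runtime.h>

int main()
{
    // Two rows of four RGBA pixels; only the first three pixels per row are
    // inside the region of interest, the fourth pixel of each row is padding.
    const int           num_row_pixels   = 3;
    const int           num_rows         = 2;
    const size_t        row_stride_bytes = 4 * 4 * sizeof(unsigned char);
    const unsigned char h_samples[2 * 4 * 4] = {
        2, 6, 7, 5,   3, 0, 2, 1,   7, 0, 6, 2,   0, 0, 0, 0,
        0, 6, 7, 5,   3, 0, 2, 6,   1, 1, 1, 1,   0, 0, 0, 0};

    unsigned char *d_samples = NULL;
    cudaMalloc((void**)&d_samples, sizeof(h_samples));
    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);

    int          *d_histogram[3];
    int           num_levels[3]  = {257, 257, 257};
    unsigned int  lower_level[3] = {0, 0, 0};
    unsigned int  upper_level[3] = {256, 256, 256};
    for (int c = 0; c < 3; ++c)
        cudaMalloc((void**)&d_histogram[c], 256 * sizeof(int));

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes);

    cudaFree(d_temp_storage);
    for (int c = 0; c < 3; ++c)
        cudaFree(d_histogram[c]);
    cudaFree(d_samples);
    return 0;
}
```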
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + //@} end member group + /******************************************************************//** + * \name Custom bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of an six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 
+ CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * int row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ , , , , , , , ] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. 
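A host-side driver for the custom-boundary HistogramRange example above might look like this; unlike HistogramEven, the seven explicit bin boundaries live in device memory rather than being passed as scalar bounds. It assumes a stock CUB installation and omits error checking.

```cpp
#include <cub/cub.cuh>
#include <cuda_runtime.h>

int main()
{
    const int   num_samples = 10;
    const float h_samples[num_samples] =
        {2.2f, 6.0f, 7.1f, 2.9f, 3.5f, 0.3f, 2.9f, 2.0f, 6.1f, 999.5f};
    const float h_levels[7] = {0.0f, 2.0f, 4.0f, 6.0f, 8.0f, 12.0f, 16.0f};

    float *d_samples = NULL, *d_levels = NULL;
    int   *d_histogram = NULL;
    cudaMalloc((void**)&d_samples, sizeof(h_samples));
    cudaMalloc((void**)&d_levels, sizeof(h_levels));
    cudaMalloc((void**)&d_histogram, 6 * sizeof(int));
    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);
    cudaMemcpy(d_levels, h_levels, sizeof(h_levels), cudaMemcpyHostToDevice);

    int num_levels = 7;  // seven boundaries delimit six bins of uneven width

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels, num_samples);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, d_levels, num_samples);

    cudaFree(d_temp_storage);
    cudaFree(d_histogram);
    cudaFree(d_levels);
    cudaFree(d_samples);
    return 0;
}
```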
\iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. 
+ * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + * // (0, 6, 7, 5),(3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // d_histogram <-- [ [1, 3, 0, 1], + * // [3, 0, 0, 2], + * // [0, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramRange( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [2, 3, 0, 1], + * // [3, 0, 0, 2], + * // [1, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + return DipatchHistogram::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + + //@} end member group +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_partition.cuh b/cpp/nvgraph/external/cub_semiring/device/device_partition.cuh new file mode 100644 index 00000000000..154506edcc0 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_partition.cuh @@ -0,0 +1,273 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from + * a specified input sequence. + * + * \par Usage Considerations + * \cdp_class{DevicePartition} + * + * \par Performance + * \linear_performance{partition} + * + * \par + * The following chart illustrates DevicePartition::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected for the first partition. + * \plots_below + * + * \image html partition_if_int32_50_percent.png + * + */ +struct DevicePartition +{ + /** + * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated partition-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected for the first partition with 50% probability. + * + * \image html partition_if_int32_50_percent.png + * \image html partition_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability for the first partition: + * + * \image html partition_if_int32_5_percent.png + * \image html partition_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
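+ * // (Note: the selection functor is passed to DevicePartition::If by value,
+ * //  and its operator() is evaluated on the device, which is why the
+ * //  definition above carries device-side annotations.)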
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_partition_flagged.cu + * \example example_device_partition_if.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_radix_sort.cuh b/cpp/nvgraph/external/cub_semiring/device/device_radix_sort.cuh new file mode 100644 index 00000000000..fe6cad65d7b --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_radix_sort.cuh @@ -0,0 +1,796 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) + * \ingroup SingleModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys + * performance across different CUDA architectures for uniform-random \p uint32 keys. 
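+ *
+ * \par
+ * Every sorting entry point below follows the same two-pass pattern: a first
+ * call with a NULL \p d_temp_storage only reports the required
+ * \p temp_storage_bytes, and a second, otherwise identical call performs the
+ * sort. A minimal sketch of that pattern (illustrative only; the device
+ * buffers \p d_keys_in and \p d_keys_out are assumed to have been allocated
+ * and populated elsewhere):
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * void sort_ints(const int *d_keys_in, int *d_keys_out, int num_items)
+ * {
+ *     void   *d_temp_storage     = NULL;
+ *     size_t  temp_storage_bytes = 0;
+ *
+ *     // Pass 1: query the temporary storage requirement (no sorting is done)
+ *     cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
+ *                                    d_keys_in, d_keys_out, num_items);
+ *
+ *     // Pass 2: allocate the scratch space and run the sort
+ *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *     cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
+ *                                    d_keys_in, d_keys_out, num_items);
+ *
+ *     cudaFree(d_temp_storage);
+ * }
+ * \endcode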
+ * \plots_below + * + * \image html lsb_radix_sort_int32_keys.png + * + */ +struct DeviceRadixSort +{ + + /******************************************************************//** + * \name KeyT-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. 
Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... 
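+ * // (The elided setup above is assumed to cudaMalloc the four device buffers
+ * //  and copy the example contents into d_keys_in and d_values_in.)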
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
+ * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts keys into ascending order. 
(~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). 
+ * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts keys into descending order. (~2N auxiliary storage required). 
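+ * Aside from the ordering of its results, this overload takes exactly the
+ * same arguments as SortKeys above.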
+ * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. 
+ * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +/** + * \example example_device_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_reduce.cuh b/cpp/nvgraph/external/cub_semiring/device/device_reduce.cuh new file mode 100644 index 00000000000..3939a7ee7bf --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_reduce.cuh @@ -0,0 +1,734 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
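+ * For example, combining the sequence [8, 6, 7, 5, 3, 0, 9] with the addition
+ * operator yields the single aggregate 38, while combining it with a minimum
+ * operator yields 0 (both cases appear in the snippets below).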
+ * + * \par Usage Considerations + * \cdp_class{DeviceReduce} + * + * \par Performance + * \linear_performance{reduction, reduce-by-key, and run-length encode} + * + * \par + * The following chart illustrates DeviceReduce::Sum + * performance across different CUDA architectures for \p int32 keys. + * + * \image html reduce_int32.png + * + * \par + * The following chart illustrates DeviceReduce::ReduceByKey (summation) + * performance across different CUDA architectures for \p fp32 + * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceReduce +{ + /** + * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * __device__ __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * CustomMin min_op; + * int init; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + T init, ///< [in] Initial value of the reduction + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + init, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide sum using the addition (\p +) operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction. + * - Does not support \p + operators that are non-commutative.. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sum-reduction performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. + * + * \image html reduce_int32.png + * \image html reduce_int64.png + * + * \par Snippet + * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction. + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // d_out <-- [{5, 0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // d_out <-- [9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // d_out <-- [{6, 9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. + * + * \par + * This operation computes segmented reductions within \p d_values_in using + * the specified binary \p reduction_op functor. The segments are identified by + * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of + * consecutive, identical keys. For the ith run encountered, + * the first key of the run and the corresponding value aggregate of that run are + * written to d_unique_out[i] and d_aggregates_out[i], + * respectively. The total number of runs encountered is written to \p d_num_runs_out. 
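// A minimal, standalone sketch of the reduce-by-key interface documented in the
// paragraph above, complementing the Doxygen snippet that follows. It assumes a
// stock CUB installation (<cub/cub.cuh>); the include path for this vendored
// cub_semiring copy may differ. Error checking is omitted for brevity.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const int num_items = 8;
    int h_keys[num_items]   = {0, 2, 2, 9, 5, 5, 5, 8};
    int h_values[num_items] = {0, 7, 1, 6, 2, 5, 3, 4};

    int *d_keys_in, *d_values_in, *d_unique_out, *d_aggregates_out, *d_num_runs_out;
    cudaMalloc(&d_keys_in,        num_items * sizeof(int));
    cudaMalloc(&d_values_in,      num_items * sizeof(int));
    cudaMalloc(&d_unique_out,     num_items * sizeof(int));
    cudaMalloc(&d_aggregates_out, num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out,   sizeof(int));
    cudaMemcpy(d_keys_in,   h_keys,   num_items * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Two-phase pattern: the first call (d_temp_storage == NULL) only sizes the
    // scratch space; the second call sums the values within each run of keys.
    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);

    int h_num_runs = 0;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("runs encountered: %d\n", h_num_runs);   // expected: 5 (keys 0, 2, 9, 5, 8)

    cudaFree(d_keys_in); cudaFree(d_values_in); cudaFree(d_unique_out);
    cudaFree(d_aggregates_out); cudaFree(d_num_runs_out); cudaFree(d_temp_storage);
    return 0;
}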
+ * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following chart illustrates reduction-by-key (sum) performance across + * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments + * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * \image html reduce_by_key_fp64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html reduce_by_key_fp32_len_5.png + * \image html reduce_by_key_fp64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the segmented reduction of \p int values grouped + * by runs of associated \p int keys. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] + * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_num_runs_out; // e.g., [-] + * CustomMin reduction_op; + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduce-by-key + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_aggregates_out <-- [0, 1, 6, 2, 4] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator + * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator + * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename ReductionOpT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t ReduceByKey( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // FlagT iterator type (not used) + + // Selection op (not used) + + // Default == operator + typedef Equality EqualityOp; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + EqualityOp(), + reduction_op, + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_run_length_encode.cuh b/cpp/nvgraph/external/cub_semiring/device/device_run_length_encode.cuh new file mode 100644 index 00000000000..ed0bf9c7d67 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_run_length_encode.cuh @@ -0,0 +1,278 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_rle.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. 
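// A minimal sketch of the run-length encoding interface documented below,
// assuming a stock CUB installation (<cub/cub.cuh>); the vendored cub_semiring
// include path may differ. It encodes [0, 2, 2, 9, 5, 5, 5, 8] into unique
// values, run lengths, and a run count. Error checking is omitted for brevity.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const int num_items = 8;
    int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};

    int *d_in, *d_unique_out, *d_counts_out, *d_num_runs_out;
    cudaMalloc(&d_in,           num_items * sizeof(int));
    cudaMalloc(&d_unique_out,   num_items * sizeof(int));
    cudaMalloc(&d_counts_out,   num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out, sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Two-phase pattern: size the temporary storage, then run the encoding.
    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);

    int h_num_runs = 0;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("runs: %d\n", h_num_runs);   // expected: 5 (unique values 0, 2, 9, 5, 8)

    cudaFree(d_in); cudaFree(d_unique_out); cudaFree(d_counts_out);
    cudaFree(d_num_runs_out); cudaFree(d_temp_storage);
    return 0;
}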
![](run_length_encode_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A run-length encoding + * computes a simple compressed representation of a sequence of input elements such that each + * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a + * count of the elements in that run. + * + * \par Usage Considerations + * \cdp_class{DeviceRunLengthEncode} + * + * \par Performance + * \linear_performance{run-length encode} + * + * \par + * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across + * different CUDA architectures for \p int32 items. + * Segments have lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceRunLengthEncode +{ + + /** + * \brief Computes a run-length encoding of the sequence \p d_in. + * + * \par + * - For the ith run encountered, the first key of the run and its length are written to + * d_unique_out[i] and d_counts_out[i], + * respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated encode performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * \image html rle_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html rle_int32_len_5.png + * \image html rle_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the run-length encoding of a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_unique_out; // e.g., [ , , , , , , , ] + * int *d_counts_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_counts_out <-- [1, 2, 1, 3, 1] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename UniqueOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Encode( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + typedef cub::Sum ReductionOp; // Value reduction operator + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + // Generator type for providing 1s values for run-length reduction + typedef ConstantInputIterator LengthsInputIteratorT; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + LengthsInputIteratorT((LengthT) 1), + d_counts_out, + d_num_runs_out, + EqualityOp(), + ReductionOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in. + * + * \par + * - For the ith non-trivial run, the run's starting offset + * and its length are written to d_offsets_out[i] and + * d_lengths_out[i], respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * + * \par Snippet + * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_offsets_out; // e.g., [ , , , , , , , ] + * int *d_lengths_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // d_offsets_out <-- [1, 4] + * // d_lengths_out <-- [2, 3] + * // d_num_runs_out <-- [2] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename OffsetsOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t NonTrivialRuns( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef Equality EqualityOp; // Default == operator + + return DeviceRleDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_scan.cuh b/cpp/nvgraph/external/cub_semiring/device/device_scan.cuh new file mode 100644 index 00000000000..4589279eeb6 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_scan.cuh @@ -0,0 +1,443 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_scan.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) + * \ingroup SingleModule + * + * \par Overview + * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output sequence where each element is computed to be the reduction + * of the elements occurring earlier in the input sequence. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * + * \par + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm + * for performing global prefix scan with only a single pass through the + * input data, as described in our 2016 technical report [1]. The central + * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies + * of global prefix propagation with local computation. As such, our algorithm requires only + * ~2n data movement (n inputs are read, n outputs are written), and typically + * proceeds at "memcpy" speeds. + * + * \par + * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) + * + * \par Usage Considerations + * \cdp_class{DeviceScan} + * + * \par Performance + * \linear_performance{prefix scan} + * + * \par + * The following chart illustrates DeviceScan::ExclusiveSum + * performance across different CUDA architectures for \p int32 keys. + * \plots_below + * + * \image html scan_int32.png + * + */ +struct DeviceScan +{ + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative sum operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated exclusive sum performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. 
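// A minimal sketch of the exclusive prefix sum described above, using the
// standard DeviceScan::ExclusiveSum entry point and assuming a stock CUB
// installation (<cub/cub.cuh>); the vendored cub_semiring include path may
// differ. Error checking is omitted for brevity.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Size the temporary storage, then run the single-pass scan.
    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    int h_out[num_items];
    cudaMemcpy(h_out, d_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; ++i)
        printf("%d ", h_out[i]);   // expected: 0 8 14 21 26 29 29
    printf("\n");

    cudaFree(d_in); cudaFree(d_out); cudaFree(d_temp_storage);
    return 0;
}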
+ * + * \image html scan_int32.png + * \image html scan_int64.png + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out s<-- [0, 8, 14, 21, 26, 29, 29] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveSum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Initial value + OutputT init_value = 0; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + init_value, + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. 
+ * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op + * ... + * + * // Determine temporary device storage requirements for exclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam Identity [inferred] Type of the \p identity value used Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide inclusive prefix sum. + * + * \par + * - Supports non-commutative sum operators. 
+ * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements for inclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveSum( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. 
+ * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for inclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + //@} end member group + +}; + +/** + * \example example_device_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_segmented_radix_sort.cuh b/cpp/nvgraph/external/cub_semiring/device/device_segmented_radix_sort.cuh new file mode 100644 index 00000000000..7f8bf8e7b3c --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_segmented_radix_sort.cuh @@ -0,0 +1,875 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. 
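// A minimal sketch of the batched segmented sort described above, using the
// standard DeviceSegmentedRadixSort::SortPairs entry point and assuming a
// stock CUB installation (<cub/cub.cuh>); the vendored cub_semiring include
// path may differ. A single offsets array of length num_segments + 1 is
// aliased for the begin/end offsets. Error checking is omitted for brevity.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const int num_items    = 7;
    const int num_segments = 3;
    int h_offsets[num_segments + 1] = {0, 3, 3, 7};   // middle segment is empty
    int h_keys[num_items]           = {8, 6, 7, 5, 3, 0, 9};
    int h_values[num_items]         = {0, 1, 2, 3, 4, 5, 6};

    int *d_offsets, *d_keys_in, *d_keys_out, *d_values_in, *d_values_out;
    cudaMalloc(&d_offsets,    (num_segments + 1) * sizeof(int));
    cudaMalloc(&d_keys_in,    num_items * sizeof(int));
    cudaMalloc(&d_keys_out,   num_items * sizeof(int));
    cudaMalloc(&d_values_in,  num_items * sizeof(int));
    cudaMalloc(&d_values_out, num_items * sizeof(int));
    cudaMemcpy(d_offsets,   h_offsets, (num_segments + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_keys_in,   h_keys,    num_items * sizeof(int),          cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values,  num_items * sizeof(int),          cudaMemcpyHostToDevice);

    // Size the temporary storage, then sort each segment independently.
    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);

    int h_keys_out[num_items];
    cudaMemcpy(h_keys_out, d_keys_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; ++i)
        printf("%d ", h_keys_out[i]);   // expected: 6 7 8 0 3 5 9
    printf("\n");

    cudaFree(d_offsets); cudaFree(d_keys_in); cudaFree(d_keys_out);
    cudaFree(d_values_in); cudaFree(d_values_out); cudaFree(d_temp_storage);
    return 0;
}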
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedRadixSort} + * + */ +struct DeviceSegmentedRadixSort +{ + + /******************************************************************//** + * \name Key-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
+ * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... 
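+ * // Note: the calls below sort on all sizeof(KeyT)*8 key bits by default.  If the
+ * // keys were known to occupy only a bit subrange (say, the low 16 bits; assumed
+ * // here purely for illustration), the optional begin_bit/end_bit arguments could
+ * // be appended to reduce sorting work, e.g.:
+ * //   cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+ * //       d_keys_in, d_keys_out, d_values_in, d_values_out,
+ * //       num_items, num_segments, d_offsets, d_offsets + 1, 0, 16);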
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... 
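+ * // Note: both buffers of each DoubleBuffer are caller-allocated and must each be
+ * // large enough to hold num_items elements; for example, the "alternate" buffers
+ * // could be obtained with (illustrative allocation, assuming int keys and values):
+ * //   cudaMalloc(&d_key_alt_buf,   num_items * sizeof(int));
+ * //   cudaMalloc(&d_value_alt_buf, num_items * sizeof(int));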
+ * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Null value type
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When input a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length num_segments+1) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as segment_offsets+1).
+     * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int num_items;       // e.g., 7
+     * int num_segments;    // e.g., 3
+     * int *d_offsets;      // e.g., [0, 3, 3, 7]
+     * int *d_keys_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_keys_out;     // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT            [inferred] Key type
+     * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename KeyT,
+        typename OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void *d_temp_storage,              ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_segmented_reduce.cuh b/cpp/nvgraph/external/cub_semiring/device/device_segmented_reduce.cuh new file mode 100644 index 00000000000..1964ec1f1c4 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_segmented_reduce.cuh @@ -0,0 +1,619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
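+ *
+ * \par
+ * Every entry point below describes its batch of segments with a pair of offset
+ * sequences.  As a small illustration (the segment lengths here are assumed), three
+ * back-to-back segments of lengths 3, 0, and 4 stored in one device array are
+ * described by the exclusive prefix sum of their lengths, and that single sequence
+ * can be passed as both the begin-offsets and, shifted by one element, the
+ * end-offsets argument:
+ * \code
+ * int h_lengths[3] = {3, 0, 4};     // per-segment lengths (assumed for illustration)
+ * int h_offsets[4] = {0, 3, 3, 7};  // h_offsets[i+1] = h_offsets[i] + h_lengths[i]
+ * // After copying h_offsets to a device array d_offsets, pass
+ * // (d_offsets, d_offsets + 1) as d_begin_offsets / d_end_offsets below.
+ * \endcode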
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedReduce} + * + */ +struct DeviceSegmentedReduce +{ + /** + * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * CustomMin min_op; + * int initial_value; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT, + typename ReductionOp, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOp reduction_op, ///< [in] Binary reduction functor + T initial_value, ///< [in] Initial value of the reduction for each segment + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented sum using the addition ('+') operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p + operators that are non-commutative.. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... 
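+ * // Note: each call below returns a cudaError_t that production code would check.
+ * // While debugging, the optional trailing (stream, debug_synchronous) arguments can
+ * // be supplied explicitly, e.g. stream 0 with per-kernel synchronization:
+ * //   cudaError_t error = cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
+ * //       d_in, d_out, num_segments, d_offsets, d_offsets + 1, 0, true);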
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [21, 0, 17] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). 
+ * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. 
+ * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [8, INT_MIN, 9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... 
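+ * // Note: each output element is a cub::KeyValuePair whose .key member is the
+ * // in-segment offset of the maximum and whose .value member is the maximum itself.
+ * // After the reduction, the three results could be copied back with, e.g.:
+ * //   cub::KeyValuePair<int, int> h_out[3];
+ * //   cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);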
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_select.cuh b/cpp/nvgraph/external/cub_semiring/device/device_select.cuh new file mode 100644 index 00000000000..58bfe82ba30 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_select.cuh @@ -0,0 +1,369 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. 
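Note: the `DeviceSegmentedReduce::ArgMax` member completed just above lost its `#include` target and the `KeyValuePair` template arguments during extraction of this diff. As a point of reference, a minimal host-side sketch of the documented two-phase usage, assuming the stock CUB 1.8 API that this vendored `cub_semiring` copy mirrors (the vendored namespace and include path may differ), would look like:

```cpp
// Minimal sketch of the segmented ArgMax usage documented above.
// Assumes the stock cub::DeviceSegmentedReduce API; all names are illustrative.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void segmented_argmax(const int* d_in,                     // device input items
                      cub::KeyValuePair<int, int>* d_out,  // one {offset, max} pair per segment
                      const int* d_offsets,                // device offsets, length num_segments + 1
                      int num_segments)
{
    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;

    // First call only reports the required temporary storage size.
    cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                       d_in, d_out, num_segments,
                                       d_offsets, d_offsets + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call performs the reduction; d_out[i] = {in-segment index, max value}.
    cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                       d_in, d_out, num_segments,
                                       d_offsets, d_offsets + 1);
    cudaFree(d_temp_storage);
}
```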
![](select_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to selectively copy + * items from a specified input sequence to a compact output sequence. + * + * \par Usage Considerations + * \cdp_class{DeviceSelect} + * + * \par Performance + * \linear_performance{select-flagged, select-if, and select-unique} + * + * \par + * The following chart illustrates DeviceSelect::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected. + * + * \image html select_if_int32_50_percent.png + * + * \par + * The following chart illustrates DeviceSelect::Unique + * performance across different CUDA architectures for \p int32 items + * where segments have lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceSelect +{ + /** + * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected with 50% probability. + * + * \image html select_if_int32_50_percent.png + * \image html select_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability: + * + * \image html select_if_int32_5_percent.png + * \image html select_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. 
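For the `DeviceSelect::If` member completed above, the embedded snippet lost its include path in this diff; a self-contained sketch of the same selection, assuming the stock `cub::DeviceSelect` API (the vendored copy may differ), is:

```cpp
// Minimal sketch of the DeviceSelect::If snippet above: keep items < 7.
// Assumes the stock CUB API; all names are illustrative.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

struct LessThan
{
    int compare;
    __host__ __device__ __forceinline__ explicit LessThan(int compare) : compare(compare) {}
    __host__ __device__ __forceinline__ bool operator()(const int& a) const { return a < compare; }
};

void select_if(const int* d_in, int* d_out, int* d_num_selected_out, int num_items)
{
    LessThan select_op(7);
    void*  d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;

    // Size query, then allocation, then the actual selection.
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
                          d_in, d_out, d_num_selected_out, num_items, select_op);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
                          d_in, d_out, d_num_selected_out, num_items, select_op);
    cudaFree(d_temp_storage);
}
```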
+ * + * \image html select_unique_int32_len_500.png + * \image html select_unique_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html select_unique_int32_len_5.png + * \image html select_unique_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Unique( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_select_flagged.cu + * \example example_device_select_if.cu + * \example example_device_select_unique.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/device_spmv.cuh b/cpp/nvgraph/external/cub_semiring/device/device_spmv.cuh new file mode 100644 index 00000000000..13e6b49ddd2 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/device_spmv.cuh @@ -0,0 +1,177 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_spmv_orig.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
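The semantics of the `DeviceSelect::Unique` member completed above ("only the first key from each run of consecutive equal values is kept") can be illustrated with a small host-side equivalent; this is only a reference for the rule, not the GPU path:

```cpp
// Host-side illustration of what DeviceSelect::Unique computes:
// keep only the first element of each run of consecutive equal values.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> in  = {0, 2, 2, 9, 5, 5, 5, 8};   // same input as the snippet above
    std::vector<int> out(in.size());

    // std::unique_copy applies the same "first key of each run" rule using operator==.
    auto end          = std::unique_copy(in.begin(), in.end(), out.begin());
    int  num_selected = static_cast<int>(end - out.begin());

    std::printf("num_selected = %d\n", num_selected);  // 5 -> {0, 2, 9, 5, 8}
    return 0;
}
```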
+ * \ingroup SingleModule + * + * \par Overview + * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) + * performs the matrix-vector operation + * y = alpha*A*x + beta*y, + * where: + * - A is an mxn sparse matrix whose non-zero structure is specified in + * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) + * (i.e., three arrays: values, row_offsets, and column_indices) + * - x and y are dense vectors + * - alpha and beta are scalar multiplicands + * + * \par Usage Considerations + * \cdp_class{DeviceSpmv} + * + */ +struct DeviceSpmv +{ + /******************************************************************//** + * \name CSR matrix operations + *********************************************************************/ + //@{ + + /** + * \brief This function performs the matrix-vector operation y = A*x. + * + * \par Snippet + * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A + * representing a 3x3 lattice (24 non-zeros). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // and output vector y + * int num_rows = 9; + * int num_cols = 9; + * int num_nonzeros = 24; + * + * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1] + * + * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + * // 4, 6, 1, 3, 5, 7, 2, 4, + * // 8, 3, 7, 4, 6, 8, 5, 7] + * + * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + * + * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + * float* d_vector_y; // e.g., [ , , , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run SpMV + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + * + * \endcode + * + * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + */ + template < + typename ValueT, + typename SemiringT> + CUB_RUNTIME_FUNCTION + static cudaError_t CsrMV( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + const int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) + const int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
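The `CsrMV` overview and snippet above describe the operation y = alpha*A*x + beta*y over a CSR matrix given as (values, row_offsets, column_indices). A plain host reference loop makes that layout concrete; this is an illustration only, not the tuned dispatch path in this header, and the `cub_semiring` variant additionally generalizes the plus/times operators through its `SemiringT` parameter:

```cpp
// Host reference for the CSR SpMV operation described above:
//   y = alpha * A * x + beta * y
// with A stored as (values, row_offsets, column_indices).
void csrmv_reference(int          num_rows,
                     const float* values,
                     const int*   row_offsets,      // length num_rows + 1
                     const int*   column_indices,
                     const float* x,
                     float*       y,
                     float alpha, float beta)
{
    for (int row = 0; row < num_rows; ++row)
    {
        float dot = 0.0f;
        for (int nz = row_offsets[row]; nz < row_offsets[row + 1]; ++nz)
            dot += values[nz] * x[column_indices[nz]];
        y[row] = alpha * dot + beta * y[row];
    }
}
```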
+ const ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y + ValueT alpha, + ValueT beta, + int num_rows, ///< [in] number of rows of matrix A. + int num_cols, ///< [in] number of columns of matrix A. + int num_nonzeros, ///< [in] number of nonzero elements of matrix A. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + SpmvParams spmv_params; + spmv_params.d_values = d_values; + spmv_params.d_row_end_offsets = d_row_offsets + 1; + spmv_params.d_column_indices = d_column_indices; + spmv_params.d_vector_x = d_vector_x; + spmv_params.d_vector_y = d_vector_y; + spmv_params.num_rows = num_rows; + spmv_params.num_cols = num_cols; + spmv_params.num_nonzeros = num_nonzeros; + spmv_params.alpha = alpha; + spmv_params.beta = beta; + + return DispatchSpmv::Dispatch( + d_temp_storage, + temp_storage_bytes, + spmv_params, + stream, + debug_synchronous); + } + + //@} end member group +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_histogram.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_histogram.cuh new file mode 100644 index 00000000000..cdebd8b8555 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_histogram.cuh @@ -0,0 +1,1096 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../../agent/agent_histogram.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Histogram kernel entry points + *****************************************************************************/ + +/** + * Histogram initialization kernel entry point + */ +template < + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename OffsetT> ///< Signed integer type for global offsets +__global__ void DeviceHistogramInitKernel( + ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel + ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + if ((threadIdx.x == 0) && (blockIdx.x == 0)) + tile_queue.ResetDrain(); + + int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + if (output_bin < num_output_bins_wrapper.array[CHANNEL]) + d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; + } +} + + +/** + * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< The input iterator type. \iterator. 
+ typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) +__global__ void DeviceHistogramSweepKernel( + SampleIteratorT d_samples, ///< Input data to reduce + ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram + ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram + ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms + ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms + ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for compositing input tiles + typedef AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT> + AgentHistogramT; + + // Shared memory for AgentHistogram + __shared__ typename AgentHistogramT::TempStorage temp_storage; + + AgentHistogramT agent( + temp_storage, + d_samples, + num_output_bins_wrapper.array, + num_privatized_bins_wrapper.array, + d_output_histograms_wrapper.array, + d_privatized_histograms_wrapper.array, + output_decode_op_wrapper.array, + privatized_decode_op_wrapper.array); + + // Initialize counters + agent.InitBinCounters(); + + // Consume input tiles + agent.ConsumeTiles( + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Store output to global (if necessary) + agent.StoreOutput(); + +} + + + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram + */ +template < + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename LevelT, ///< Type for specifying bin level boundaries + typename OffsetT> ///< Signed integer type for global offsets +struct DipatchHistogram +{ + 
//--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + enum + { + // Maximum number of bins per channel for which we will use a privatized smem strategy + MAX_PRIVATIZED_SMEM_BINS = 256 + }; + + + //--------------------------------------------------------------------- + // Transform functors for converting samples to bin-ids + //--------------------------------------------------------------------- + + // Searches for bin given a list of bin-boundary levels + template + struct SearchTransform + { + LevelIteratorT d_levels; // Pointer to levels array + int num_output_levels; // Number of levels in array + + // Initializer + __host__ __device__ __forceinline__ void Init( + LevelIteratorT d_levels, // Pointer to levels array + int num_output_levels) // Number of levels in array + { + this->d_levels = d_levels; + this->num_output_levels = num_output_levels; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + /// Level iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + LevelIteratorT>::Type // Directly use the supplied input iterator type + WrappedLevelIteratorT; + + WrappedLevelIteratorT wrapped_levels(d_levels); + + int num_bins = num_output_levels - 1; + if (valid) + { + bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; + if (bin >= num_bins) + bin = -1; + } + } + }; + + + // Scales samples to evenly-spaced bins + struct ScaleTransform + { + int num_bins; // Number of levels in array + LevelT max; // Max sample level (exclusive) + LevelT min; // Min sample level (inclusive) + LevelT scale; // Bin scaling factor + + // Initializer + template + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + _LevelT max, // Max sample level (exclusive) + _LevelT min, // Min sample level (inclusive) + _LevelT scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = scale; + } + + // Initializer (float specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + float max, // Max sample level (exclusive) + float min, // Min sample level (inclusive) + float scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = float(1.0) / scale; + } + + // Initializer (double specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + double max, // Max sample level (exclusive) + double min, // Min sample level (inclusive) + double scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = double(1.0) / scale; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) / scale); + } + + // Method for converting samples to bin-ids (float specialization) + 
template + __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + + // Method for converting samples to bin-ids (double specialization) + template + __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + }; + + + // Pass-through bin transform operator + struct PassThruTransform + { + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + if (valid) + bin = (int) sample; + } + }; + + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + template + struct TScale + { + enum + { + V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), + VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) + }; + }; + + + /// SM11 + struct Policy110 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + (NUM_CHANNELS == 1) ? 256 : 128, + (NUM_CHANNELS == 1) ? 8 : 3, + (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 
8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM35 + struct Policy350 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 128, + TScale<8>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLEND, + true> + HistogramSweepPolicy; + }; + + /// SM50 + struct Policy500 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 384, + TScale<16>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + KernelConfig &histogram_sweep_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + return histogram_sweep_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + return histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + return cudaErrorNotSupported; + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int pixels_per_thread; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; + + return cudaSuccess; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Privatization-based dispatch routine + */ + template < + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel + typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel + CUB_RUNTIME_FUNCTION 
__forceinline__ + static cudaError_t PrivatizedDispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int max_num_output_bins, ///< [in] Maximum number of output bins in any channel + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel + DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel + KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
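The `PrivatizedDispatch` body that follows carves the caller's single temp-storage blob into one privatized histogram per active channel plus a `GridQueue` descriptor (via `AliasTemporaries`), using the usual "NULL pointer means size query" convention. A simplified sketch of that size-then-carve idiom is shown below; this is a hypothetical helper for illustration, not the library routine, and the 256-byte alignment is an assumption:

```cpp
// Simplified sketch of the "alias several allocations out of one temp-storage blob"
// idiom used by PrivatizedDispatch below. Hypothetical helper: the real
// cub::AliasTemporaries has a different signature and its own alignment policy.
#include <cuda_runtime.h>

inline cudaError_t alias_temporaries_sketch(void*         d_temp_storage,      // NULL means "size query only"
                                            size_t&       temp_storage_bytes,  // in/out: required bytes
                                            void**        allocations,
                                            const size_t* allocation_sizes,
                                            int           num_allocations)
{
    size_t total = 0;
    for (int i = 0; i < num_allocations; ++i)
        total += (allocation_sizes[i] + 255) & ~size_t(255);   // assumed 256-byte alignment

    if (d_temp_storage == NULL)
    {
        temp_storage_bytes = total;        // first pass: report the required size and return
        return cudaSuccess;
    }
    if (temp_storage_bytes < total)
        return cudaErrorInvalidValue;      // caller's blob is too small

    size_t offset = 0;
    for (int i = 0; i < num_allocations; ++i)                  // second pass: hand out sub-allocations
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + offset;
        offset += (allocation_sizes[i] + 255) & ~size_t(255);
    }
    return cudaSuccess;
}
```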
+ { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histogram_sweep_kernel + int histogram_sweep_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histogram_sweep_sm_occupancy, + histogram_sweep_kernel, + histogram_sweep_config.block_threads))) break; + + // Get device occupancy for histogram_sweep_kernel + int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; + + if (num_row_pixels * NUM_CHANNELS == row_stride_samples) + { + // Treat as a single linear array of samples + num_row_pixels *= num_rows; + num_rows = 1; + row_stride_samples = num_row_pixels * NUM_CHANNELS; + } + + // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy + int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; + int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; + int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); + int blocks_per_col = (blocks_per_row > 0) ? + int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : + 0; + int num_thread_blocks = blocks_per_row * blocks_per_col; + + dim3 sweep_grid_dims; + sweep_grid_dims.x = (unsigned int) blocks_per_row; + sweep_grid_dims.y = (unsigned int) blocks_per_col; + sweep_grid_dims.z = 1; + + // Temporary storage allocation requirements + const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; + void* allocations[NUM_ALLOCATIONS]; + size_t allocation_sizes[NUM_ALLOCATIONS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); + + allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the grid queue descriptor + GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_output_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; + + // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_privatized_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; + + // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper privatized_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; + + // Setup 
array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper output_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; + + // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_privatized_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; + + // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_output_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; + + int histogram_init_block_threads = 256; + int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; + + // Log DeviceHistogramInitKernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", + histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); + + // Invoke histogram_init_kernel + histogram_init_kernel<<>>( + num_output_bins_wrapper, + d_output_histograms_wrapper, + tile_queue); + + // Return if empty problem + if ((blocks_per_row == 0) || (blocks_per_col == 0)) + break; + + // Log histogram_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", + sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, + histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); + + // Invoke histogram_sweep_kernel + histogram_sweep_kernel<<>>( + d_samples, + num_output_bins_wrapper, + num_privatized_bins_wrapper, + d_output_histograms_wrapper, + d_privatized_histograms_wrapper, + output_decode_op_wrapper, + privatized_decode_op_wrapper, + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. 
+ int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the search transform op for converting samples to privatized bins + typedef SearchTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + // Dispatch + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Too many bins to keep in shared memory. 
+ const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the search transform op for converting privatized bins to output bins + typedef SearchTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; // Maximum number of levels in any channel + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
+ OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the scale transform op for converting samples to privatized bins + typedef ScaleTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + + privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + } + while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the scale transform op for converting privatized bins to output bins + typedef ScaleTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + 
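+            /*
+             * For context, the DispatchEven/DispatchRange routines above back the public
+             * DeviceHistogram front end. A minimal host-side sketch of the usual
+             * two-phase calling convention follows; it assumes the standard CUB API,
+             * which this vendored copy mirrors, and the buffer names are placeholders:
+             *
+             *     float  *d_samples;     // device array of num_samples values
+             *     int    *d_histogram;   // device array of (num_levels - 1) counters
+             *     int     num_levels  = 7;
+             *     float   lower_level = 0.0f, upper_level = 12.0f;
+             *     int     num_samples = 1 << 20;
+             *
+             *     void   *d_temp_storage     = NULL;
+             *     size_t  temp_storage_bytes = 0;
+             *     // First call only sizes the temporary storage; no work is done
+             *     cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+             *         d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+             *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+             *     // Second call computes the histogram
+             *     cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+             *         d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+             */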
} + while (0); + + return error; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_radix_sort.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_radix_sort.cuh new file mode 100644 index 00000000000..f9793ebd53e --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_radix_sort.cuh @@ -0,0 +1,1652 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_radix_sort_upsweep.cuh" +#include "../../agent/agent_radix_sort_downsweep.cuh" +#include "../../agent/agent_scan.cuh" +#include "../../block/block_radix_sort.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. 
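+ *
+ * In outline, each digit-place pass of the multi-block sort in this file is a
+ * three-kernel sequence (see DispatchRadixSort::InvokePass further below):
+ *   1) this upsweep kernel counts, per thread block, how many keys fall into
+ *      each radix digit bin;
+ *   2) RadixSortScanBinsKernel runs an exclusive prefix sum over those
+ *      privatized counts to produce global scatter offsets;
+ *   3) DeviceRadixSortDownsweepKernel re-reads the keys and scatters them
+ *      (and any values) to their offsets for the current digit place.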
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortUpsweepKernel( + const KeyT *d_keys, ///< [in] Input keys buffer + OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + OffsetT /*num_items*/, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortUpsweep type for the current configuration + typedef AgentRadixSortUpsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy, + typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type, + KeyT, + OffsetT> + AgentRadixSortUpsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; + + // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block + even_share.template BlockInit(); + + AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits); + + upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); + + CTA_SYNC(); + + // Write out digit counts (striped) + upsweep.ExtractCounts(d_spine, gridDim.x, blockIdx.x); +} + + +/** + * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortScanBinsKernel( + OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + int num_counts) ///< [in] Total number of bin-counts +{ + // Parameterize the AgentScan type for the current configuration + typedef AgentScan< + typename ChainedPolicyT::ActivePolicy::ScanPolicy, + OffsetT*, + OffsetT*, + cub::Sum, + OffsetT, + OffsetT> + AgentScanT; + + // Shared memory storage + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Block scan instance + AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ; + + // Process full input tiles + int block_offset = 0; + BlockScanRunningPrefixOp prefix_op(0, Sum()); + while (block_offset + AgentScanT::TILE_ITEMS <= num_counts) + { + block_scan.template ConsumeTile(block_offset, prefix_op); + block_offset += AgentScanT::TILE_ITEMS; + } +} + + +/** + * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. 
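+ *
+ * The spine consumed here is striped by digit: all blocks' counts for digit 0,
+ * then all blocks' counts for digit 1, and so on. After the exclusive prefix
+ * sum in RadixSortScanBinsKernel, the spine entry for (digit d, block b) is the
+ * global base offset at which block b scatters its keys whose current digit is
+ * d. As a small worked example with 2 blocks and 4 digit values, per-block
+ * counts of [2,1,0,3] and [1,2,2,1] give the striped spine [2,1, 1,2, 0,2, 3,1],
+ * whose exclusive prefix sum [0,2, 3,4, 6,6, 8,11] says, for instance, that
+ * block 1 scatters its digit-1 keys starting at global offset 4.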
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortDownsweepKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortDownsweep type for the current configuration + typedef AgentRadixSortDownsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, + typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type, + IS_DESCENDING, + KeyT, + ValueT, + OffsetT> + AgentRadixSortDownsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.template BlockInit(); + + // Process input tiles + AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( + even_share.block_offset, + even_share.block_end); +} + + +/** + * Single pass kernel entry point (single-block). Fully sorts a tile of input. 
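+ *
+ * This path is taken by DispatchRadixSort::Invoke when the entire input fits in
+ * one tile (num_items <= BLOCK_THREADS * ITEMS_PER_THREAD of the single-tile
+ * policy): a single thread block loads the tile, sorts it in-core with
+ * BlockRadixSort across all requested bits, and writes the result, so the
+ * multi-pass upsweep/scan/downsweep pipeline and its spine storage are skipped
+ * entirely.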
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceRadixSortSingleTileKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison +{ + // Constants + enum + { + BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, + KEYS_ONLY = Equals::VALUE, + }; + + // BlockRadixSort type + typedef BlockRadixSort< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ValueT, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, + (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> + BlockRadixSortT; + + // BlockLoad type (keys) + typedef BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; + + // Unsigned word for key bits + typedef typename Traits::UnsignedBits UnsignedBitsT; + + // Shared memory storage + __shared__ union TempStorage + { + typename BlockRadixSortT::TempStorage sort; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + + } temp_storage; + + // Keys and values for the block + KeyT keys[ITEMS_PER_THREAD]; + ValueT values[ITEMS_PER_THREAD]; + + // Get default (min/max) value for out-of-bounds keys + UnsignedBitsT default_key_bits = (IS_DESCENDING) ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; + KeyT default_key = reinterpret_cast(default_key_bits); + + // Load keys + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); + + CTA_SYNC(); + + // Load values + if (!KEYS_ONLY) + { + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); + + CTA_SYNC(); + } + + // Sort tile + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( + keys, + values, + current_bit, + end_bit, + Int2Type(), + Int2Type()); + + // Store keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; + if (item_offset < num_items) + { + d_keys_out[item_offset] = keys[ITEM]; + if (!KEYS_ONLY) + d_values_out[item_offset] = values[ITEM]; + } + } +} + + +/** + * Segmented radix sorting pass (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedRadixSortKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + int current_bit, ///< [in] Bit position of current radix digit + int pass_bits) ///< [in] Number of bits of current radix digit +{ + // + // Constants + // + + typedef typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, + typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; + + enum + { + BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, + RADIX_BITS = SegmentedPolicyT::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Upsweep type + typedef AgentRadixSortUpsweep< + AgentRadixSortUpsweepPolicy, + KeyT, + OffsetT> + BlockUpsweepT; + + // Digit-scan type + typedef BlockScan DigitScanT; + + // Downsweep type + typedef AgentRadixSortDownsweep BlockDownsweepT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD + }; + + // + // Process input tiles + // + + // Shared memory storage + __shared__ union + { + typename BlockUpsweepT::TempStorage upsweep; + typename BlockDownsweepT::TempStorage downsweep; + struct + { + volatile OffsetT reverse_counts_in[RADIX_DIGITS]; + volatile OffsetT reverse_counts_out[RADIX_DIGITS]; + typename DigitScanT::TempStorage scan; + }; + + } temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + OffsetT num_items = segment_end - segment_begin; + + // Check if empty segment + if (num_items <= 0) + return; + + // Upsweep + BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); + upsweep.ProcessRegion(segment_begin, segment_end); + + CTA_SYNC(); + + // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) + OffsetT bin_count[BINS_TRACKED_PER_THREAD]; + upsweep.ExtractCounts(bin_count); + + CTA_SYNC(); + + if (IS_DESCENDING) + { + // Reverse bin counts + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; + } + } + + // Scan + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) + DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + bin_offset[track] += segment_begin; + } + + if (IS_DESCENDING) + { + // Reverse bin offsets + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; + } + } + + CTA_SYNC(); + + // Downsweep + BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); + downsweep.ProcessRegion(segment_begin, segment_end); +} + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +/** + * Tuning policy for kernel specialization + */ +template < + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy 
SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, 
RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + enum { + PRIMARY_RADIX_BITS = 6, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + }; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef DownsweepPolicyKeys DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + + + }; + + + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + { + enum { + PRIMARY_RADIX_BITS = 7, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) + SINGLE_TILE_RADIX_BITS = 6, + SEGMENTED_RADIX_BITS = 6, // 3.1B 32b segmented keys/s (TitanX) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <160, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef 
AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// SM60 (GP100) + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> + { + enum { + PRIMARY_RADIX_BITS = 7, // 6.9B 32b keys/s (Quadro P100) + SINGLE_TILE_RADIX_BITS = 6, + SEGMENTED_RADIX_BITS = 6, // 5.9B 32b segmented keys/s (Quadro P100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 25 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + + }; + + + /// SM61 (GP104) + struct Policy610 : ChainedPolicy<610, Policy610, Policy600> + { + enum { + PRIMARY_RADIX_BITS = 7, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) + SINGLE_TILE_RADIX_BITS = 6, + SEGMENTED_RADIX_BITS = 6, // 3.3B 32b segmented keys/s (1080) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 35 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, 
BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// SM62 (Tegra, less RF) + struct Policy620 : ChainedPolicy<620, Policy620, Policy610> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS> AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM70 (GV100) + struct Policy700 : ChainedPolicy<700, Policy700, Policy620> + { + enum { + PRIMARY_RADIX_BITS = 6, // 7.62B 32b keys/s (GV100) + SINGLE_TILE_RADIX_BITS = 6, + SEGMENTED_RADIX_BITS = 6, // 8.7B 32b segmented keys/s (GV100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> UpsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, 
BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// MaxPolicy + typedef Policy700 MaxPolicy; + + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
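+    // The DoubleBuffer members above follow CUB's ping-pong convention: each
+    // digit pass reads from the "current" buffer and writes to the alternate
+    // one, and the selector recorded at the end of InvokePasses tells the
+    // caller which buffer finally holds the sorted output (d_keys.Current()).
+    // The ptx_version member below feeds ChainedPolicy, which walks the tuning
+    // policies above (Policy700 down to Policy130) and activates the most
+    // specific policy that is not newer than the target architecture.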
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + begin_bit(begin_bit), + end_bit(end_bit), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version), + is_overwrite_okay(is_overwrite_okay) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block to sort in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Log single_tile_kernel configuration + if (debug_synchronous) + _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_keys.Current(), + d_keys.Alternate(), + d_values.Current(), + d_values.Alternate(), + num_items, + begin_bit, + end_bit); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update selector + d_keys.selector ^= 1; + d_values.selector ^= 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation + //------------------------------------------------------------------------------ + + /** + * Invoke a three-kernel sorting pass at the current bit. 
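+ *
+ * Each pass covers pass_bits = min(radix_bits, end_bit - current_bit) bits,
+ * and InvokePasses below plans the passes so the requested bit range divides
+ * evenly: it first runs passes of the narrower "alternate" digit configuration,
+ * then full-width passes, ping-ponging the key (and value) buffers in between.
+ * For example, sorting bits [0, 32) with a 7-bit primary and 6-bit alternate
+ * digit takes ceil(32 / 7) = 5 passes; 3 of them run at 6 bits and 2 at 7 bits
+ * (3*6 + 2*7 = 32).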
+ */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT *d_spine, + int spine_length, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log upsweep_kernel configuration + if (debug_synchronous) + _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, + pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + pass_config.upsweep_kernel<<>>( + d_keys_in, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); + + // Invoke scan_kernel + pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( + d_spine, + spine_length); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log downsweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, + pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); + + // Invoke downsweep_kernel + pass_config.downsweep_kernel<<>>( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + + /// Pass configuration structure + template < + typename UpsweepKernelT, + typename ScanKernelT, + typename DownsweepKernelT> + struct PassConfig + { + UpsweepKernelT upsweep_kernel; + KernelConfig upsweep_config; + ScanKernelT scan_kernel; + KernelConfig scan_config; + DownsweepKernelT downsweep_kernel; + KernelConfig downsweep_config; + int radix_bits; + int radix_digits; + int max_downsweep_grid_size; + GridEvenShare even_share; + + /// Initialize pass configuration + template < + typename UpsweepPolicyT, + typename ScanPolicyT, + typename DownsweepPolicyT> + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig( + UpsweepKernelT upsweep_kernel, + ScanKernelT scan_kernel, + DownsweepKernelT downsweep_kernel, + int ptx_version, + int sm_count, + int num_items) + { + cudaError error = cudaSuccess; + do + { + 
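+                // The steps below record the kernel pointers, take the digit width from
+                // the downsweep policy, query each kernel's launch configuration, and
+                // size the downsweep grid as (SM occupancy x SM count x subscription
+                // factor) before initializing the even-share distribution of input tiles.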
this->upsweep_kernel = upsweep_kernel; + this->scan_kernel = scan_kernel; + this->downsweep_kernel = downsweep_kernel; + radix_bits = DownsweepPolicyT::RADIX_BITS; + radix_digits = 1 << radix_bits; + + if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; + if (CubDebug(error = scan_config.Init(scan_kernel))) break; + if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; + + max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); + + even_share.DispatchInit( + num_items, + max_downsweep_grid_size, + CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + + } + while (0); + return error; + } + + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel + typename ScanKernelT, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)upsweep_kernel; + (void)alt_upsweep_kernel; + (void)scan_kernel; + (void)downsweep_kernel; + (void)alt_downsweep_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular and alternate-digit kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig< + typename ActivePolicyT::UpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::DownsweepPolicy>( + upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; + + if ((error = alt_pass_config.template InitPassConfig< + typename ActivePolicyT::AltUpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::AltDownsweepPolicy>( + alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; + + // Get maximum spine length + int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); + int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; + + // Temporary storage allocation requirements + void* allocations[3]; + size_t allocation_sizes[3] = + { + spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms + (is_overwrite_okay) ? 
0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); + + // Alias the temporary storage allocations + OffsetT *d_spine = static_cast(allocations[0]); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[2]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[2]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + d_spine, spine_length, current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_spine, spine_length, current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break;; + + // Invert selectors + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceRadixSortSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, + DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, + RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items, ///< [in] Number of items to sort + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSegmentedRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Parameter members + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
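Both the dispatcher above and the segmented dispatcher that follows plan their passes the same way: as many preferred-width digit passes as will fit, with the leftover bits absorbed by narrower "alternate" passes at the low end, while a pair of ping-pong buffers flips its selector after every pass. Below is a small host-only sketch of that arithmetic, assuming (as the policies here do) that the alternate digit is one bit narrower than the preferred one; the variable names mirror the code above, but this is not CUB code.

```cpp
// Editor's sketch of the pass-planning arithmetic used by InvokePasses.
#include <algorithm>
#include <cstdio>

int main()
{
    int begin_bit = 0, end_bit = 32;          // sort the full 32-bit key range
    int radix_bits = 7, alt_radix_bits = 6;   // alternate digit is one bit narrower

    int num_bits       = end_bit - begin_bit;                       // 32
    int num_passes     = (num_bits + radix_bits - 1) / radix_bits;  // 5 passes
    int max_alt_passes = (num_passes * radix_bits) - num_bits;      // 3 spare bits
    int alt_end_bit    = std::min(end_bit,
                                  begin_bit + max_alt_passes * alt_radix_bits);

    // Passes whose current_bit < alt_end_bit use the narrower alternate config;
    // because the alternate digit is one bit narrower, the spare bits are absorbed
    // without changing the total pass count.
    int current_bit = begin_bit, selector = 0, passes_run = 0;
    while (current_bit < end_bit) {
        int bits = std::min((current_bit < alt_end_bit) ? alt_radix_bits : radix_bits,
                            end_bit - current_bit);
        current_bit += bits;
        selector ^= 1;          // ping-pong: output of this pass feeds the next one
        ++passes_run;
    }

    // When the caller's buffers must not be overwritten, the dispatcher instead
    // advances the selector by exactly one, since the result always lands in the
    // temporary ("other") buffer.
    std::printf("planned=%d run=%d alt_end_bit=%d final selector=%d\n",
                num_passes, passes_run, alt_end_bit, selector);
    return 0;
}
```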
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructors + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + begin_bit(begin_bit), + end_bit(end_bit), + is_overwrite_okay(is_overwrite_okay), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Multi-segment invocation + //------------------------------------------------------------------------------ + + /// Invoke a three-kernel sorting pass at the current bit. + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log kernel configuration + if (debug_synchronous) + _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + num_segments, pass_config.segmented_config.block_threads, (long long) stream, + pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); + + pass_config.segmented_kernel<<>>( + d_keys_in, d_keys_out, + d_values_in, d_values_out, + d_begin_offsets, d_end_offsets, num_segments, + current_bit, pass_bits); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + /// PassConfig data structure + template + struct PassConfig + { + SegmentedKernelT segmented_kernel; + KernelConfig segmented_config; + int radix_bits; + int radix_digits; + + /// Initialize pass configuration + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) + { + this->segmented_kernel = segmented_kernel; + this->radix_bits = SegmentedPolicyT::RADIX_BITS; + this->radix_digits = 1 << radix_bits; + + return CubDebug(segmented_config.Init(segmented_kernel)); + } + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization 
of cub::DeviceSegmentedRadixSortKernel + SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_kernel; + (void)alt_segmented_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Init regular and alternate kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; + if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + if (temp_storage_bytes == 0) + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; + int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + radix_bits - 1) / radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + + // Invert selectors and update current bit + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedRadixSortKernel, + DeviceSegmentedRadixSortKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + + /// Internal dispatch routine + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, num_segments, d_begin_offsets, d_end_offsets, + begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_reduce.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_reduce.cuh new file mode 100644 index 00000000000..b6aa44cc0e5 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_reduce.cuh @@ -0,0 +1,882 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "../../agent/agent_reduce.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block + ReductionOpT reduction_op) ///< [in] Binary reduction functor +{ + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + d_out[blockIdx.x] = block_aggregate; +} + + +/** + * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. 
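`DeviceReduceKernel` above gives each block an even share of input tiles and writes one privatized aggregate per block; the single-tile kernel declared next folds those aggregates together with the caller's `init`. The following is an editor's stripped-down CUDA sketch of that two-kernel shape, without CUB's generic iterators, vectorized loads, or tuning policies; all names are illustrative.

```cuda
// Editor's sketch (not CUB): kernel 1 writes one partial sum per block,
// kernel 2 runs in a single block and folds the partials with an initial value.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int BLOCK_THREADS = 256;

__global__ void block_partials(const int* in, int* partials, int n)
{
    __shared__ int smem[BLOCK_THREADS];
    int tid  = threadIdx.x;
    int gid  = blockIdx.x * blockDim.x + threadIdx.x;
    int step = gridDim.x * blockDim.x;

    int sum = 0;                                     // grid-stride accumulation
    for (int i = gid; i < n; i += step) sum += in[i];
    smem[tid] = sum;
    __syncthreads();

    for (int stride = BLOCK_THREADS / 2; stride > 0; stride >>= 1) {
        if (tid < stride) smem[tid] += smem[tid + stride];
        __syncthreads();                             // shared-memory tree reduction
    }
    if (tid == 0) partials[blockIdx.x] = smem[0];    // one aggregate per block
}

__global__ void single_tile(const int* partials, int* out, int num_partials, int init)
{
    __shared__ int smem[BLOCK_THREADS];
    int tid = threadIdx.x;
    int sum = 0;
    for (int i = tid; i < num_partials; i += BLOCK_THREADS) sum += partials[i];
    smem[tid] = sum;
    __syncthreads();
    for (int stride = BLOCK_THREADS / 2; stride > 0; stride >>= 1) {
        if (tid < stride) smem[tid] += smem[tid + stride];
        __syncthreads();
    }
    if (tid == 0) *out = init + smem[0];             // fold in the initial value
}

int main()
{
    const int n = 1 << 20, num_blocks = 64, init = 0;
    int *d_in, *d_partials, *d_out;
    cudaMalloc(&d_in, n * sizeof(int));
    cudaMalloc(&d_partials, num_blocks * sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMemset(d_in, 0, n * sizeof(int));            // all zeros, expected sum 0

    block_partials<<<num_blocks, BLOCK_THREADS>>>(d_in, d_partials, n);
    single_tile<<<1, BLOCK_THREADS>>>(d_partials, d_out, num_blocks, init);

    int result = -1;
    cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("sum = %d\n", result);
    return 0;
}
```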
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OuputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + *d_out = init; + return; + } + + // Consume input tiles + OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + OffsetT(0), + num_items); + + // Output result + if (threadIdx.x == 0) + *d_out = reduction_op(init, block_aggregate); +} + + +/// Normalize input iterator to segment offset +template +__device__ __forceinline__ +void NormalizeReductionOutput( + T &/*val*/, + OffsetT /*base_offset*/, + IteratorT /*itr*/) +{} + + +/// Normalize input iterator to segment offset (specialized for arg-index) +template +__device__ __forceinline__ +void NormalizeReductionOutput( + KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) +{ + val.key -= base_offset; +} + + +/** + * Segmented reduction (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) + d_out[blockIdx.x] = init; + return; + } + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + segment_begin, + segment_end); + + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + + if (threadIdx.x == 0) + d_out[blockIdx.x] = reduction_op(init, block_aggregate);; +} + + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DeviceReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct Policy350 : 
ChainedPolicy<350, Policy350, Policy300> + { + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(256, 20, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(256, 16, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// MaxPolicy + typedef Policy600 MaxPolicy; + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchReduce : + DeviceReducePolicy< + typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type, // ... else the output iterator's value type + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + // Data type of output iterator + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
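The `Policy130` through `Policy600` structs above form a compile-time chain that is walked at dispatch time: each link compares the device's PTX version against its own threshold and either falls back to the previous (older) policy or invokes the dispatcher with itself as the active policy. A much-simplified host-only sketch of that pattern follows; the names are hypothetical, and the real `ChainedPolicy` additionally resolves an active policy type at compile time for device-side code.

```cpp
// Editor's sketch (hypothetical names): a chained-policy lookup that picks the
// newest tuning whose threshold does not exceed the device's PTX version.
#include <cstdio>

struct DispatchOp {                       // stand-in for DispatchReduce and friends
    template <typename ActivePolicy>
    void Invoke() { std::printf("tuned for SM%d\n", ActivePolicy::THRESHOLD / 10); }
};

template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct Chained {
    enum { THRESHOLD = PTX_VERSION };
    template <typename Op>
    static void Invoke(int ptx_version, Op& op) {
        if (ptx_version < PTX_VERSION)
            PrevPolicyT::Invoke(ptx_version, op);   // fall back to an older tuning
        else
            op.template Invoke<PolicyT>();          // this policy is active
    }
};

// Terminal case: a policy chained to itself ends the recursion.
template <int PTX_VERSION, typename PolicyT>
struct Chained<PTX_VERSION, PolicyT, PolicyT> {
    enum { THRESHOLD = PTX_VERSION };
    template <typename Op>
    static void Invoke(int, Op& op) { op.template Invoke<PolicyT>(); }
};

struct Policy130 : Chained<130, Policy130, Policy130> { /* tuning parameters */ };
struct Policy350 : Chained<350, Policy350, Policy130> { /* tuning parameters */ };
struct Policy600 : Chained<600, Policy600, Policy350> { /* tuning parameters */ };

int main() {
    DispatchOp op;
    Policy600::Invoke(350, op);   // an SM35 device ends up with Policy350's tuning
    return 0;
}
```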
+ size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_items(num_items), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block block to reduce in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke single_reduce_sweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_in, + d_out, + num_items, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation (two-pass) + 
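The `d_temp_storage == NULL` branch above (and the same branch in the two-pass path that follows) is what implements CUB's two-phase calling convention: the first call only reports the required scratch size, the second call does the work. Below is a usage sketch with the standard `cub::DeviceReduce::Sum` front-end; the vendored `cub_semiring` copy in this tree may live under a different namespace, so treat this as illustrative usage rather than this header's exact API.

```cuda
// Editor's sketch: size-query call followed by the real call.
#include <cstdio>
#include <cuda_runtime.h>
#include <cub/cub.cuh>

int main()
{
    const int num_items = 1 << 20;
    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMemset(d_in, 0, num_items * sizeof(int));   // all zeros, so the sum is 0

    // Phase 1: query. With d_temp_storage == NULL the dispatcher only writes
    // the required size into temp_storage_bytes and returns.
    void*  d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    // Phase 2: allocate the blob and run the reduction for real.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    int h_out = -1;
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("sum = %d (temp storage = %zu bytes)\n", h_out, temp_storage_bytes);
    return 0;
}
```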
//------------------------------------------------------------------------------ + + /// Invoke two-passes to reduce + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void) reduce_kernel; + (void) single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular kernel configuration + KernelConfig reduce_config; + if (CubDebug(error = reduce_config.Init(reduce_kernel))) break; + int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); + GridEvenShare even_share; + even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + OutputT *d_block_reductions = (OutputT*) allocations[0]; + + // Get grid size for device_reduce_sweep_kernel + int reduce_grid_size = even_share.grid_size; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_grid_size, + ActivePolicyT::ReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, + reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + reduce_kernel<<>>( + d_in, + d_block_reductions, + num_items, + even_share, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke DeviceReduceSingleTileKernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_block_reductions, + d_out, + 
reduce_grid_size, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceReduceSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceReduceKernel, + DeviceReduceSingleTileKernel); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchSegmentedReduce : + DeviceReducePolicy< + typename std::iterator_traits::value_type, + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, + segmented_reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + segmented_reduce_kernel<<>>( + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine 
for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) + return cudaSuccess; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, + num_segments, d_begin_offsets, d_end_offsets, + reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_reduce_by_key.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_reduce_by_key.cuh new file mode 100644 index 00000000000..672bc49393a --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_reduce_by_key.cuh @@ -0,0 +1,554 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceReduceByKeyKernel( + KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op, ///< ValueT reduction operator + OffsetT num_items) ///< Total number of 
items to select from
+{
+    // Thread block type for reducing tiles of value segments
+    typedef AgentReduceByKey<
+            AgentReduceByKeyPolicyT,
+            KeysInputIteratorT,
+            UniqueOutputIteratorT,
+            ValuesInputIteratorT,
+            AggregatesOutputIteratorT,
+            NumRunsOutputIteratorT,
+            EqualityOpT,
+            ReductionOpT,
+            OffsetT>
+        AgentReduceByKeyT;
+
+    // Shared memory for AgentReduceByKey
+    __shared__ typename AgentReduceByKeyT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
+        num_items,
+        tile_state,
+        start_tile);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
+ */
+template <
+    typename KeysInputIteratorT,            ///< Random-access input iterator type for keys
+    typename UniqueOutputIteratorT,         ///< Random-access output iterator type for keys
+    typename ValuesInputIteratorT,          ///< Random-access input iterator type for values
+    typename AggregatesOutputIteratorT,     ///< Random-access output iterator type for values
+    typename NumRunsOutputIteratorT,        ///< Output iterator type for recording number of segments encountered
+    typename EqualityOpT,                   ///< KeyT equality operator type
+    typename ReductionOpT,                  ///< ValueT reduction operator type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct DispatchReduceByKey
+{
+    //-------------------------------------------------------------------------
+    // Types and constants
+    //-------------------------------------------------------------------------
+
+    // The input keys type
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT = (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT = (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
+
+    enum
+    {
+        INIT_KERNEL_THREADS     = 128,
+        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
+        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
+    };
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+
+    //-------------------------------------------------------------------------
+    // Tuning policies
+    //-------------------------------------------------------------------------
+
+    /// SM35
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ?
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM11 + struct Policy110 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); 
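        /*
         * Note (illustrative, based on stock CUB rather than this file): each of these
         * Init() calls names the nested tuning policy as an explicit template argument,
         * along the lines of
         *
         *     // device pass: use the policy compiled for the current PTX architecture
         *     reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
         *
         *     // host pass: pick the branch matching the device's PTX version
         *     reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
         */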
+ } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduce-by-key using the + * specified kernel functions. + */ + template < + typename ScanInitKernelT, ///< Function type of cub::DeviceScanInitKernel + typename ReduceByKeyKernelT> ///< Function type of cub::DeviceReduceByKeyKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel + KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_unique_out; + (void)d_values_in; + (void)d_aggregates_out; + (void)d_num_runs_out; + (void)equality_op; + (void)reduction_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)reduce_by_key_kernel; + (void)reduce_by_key_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for reduce_by_key_kernel + int reduce_by_key_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_sm_occupancy, // out + reduce_by_key_kernel, + reduce_by_key_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log reduce_by_key_kernel configuration + if (debug_synchronous) 
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); + + // Invoke reduce_by_key_kernel + reduce_by_key_kernel<<>>( + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_by_key_config; + InitConfigs(ptx_version, reduce_by_key_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + equality_op, + reduction_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceReduceByKeyKernel, + reduce_by_key_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_rle.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_rle.cuh new file mode 100644 index 00000000000..1de979e88cd --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_rle.cuh @@ -0,0 +1,538 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_rle.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) +__global__ void DeviceRleSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output 
sequence of run-offsets
+    LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
+    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+    ScanTileStateT              tile_status,        ///< [in] Tile status interface
+    EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
+    OffsetT                     num_items,          ///< [in] Total number of input items (i.e., length of \p d_in)
+    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
+{
+    // Thread block type for selecting data from input tiles
+    typedef AgentRle<
+        AgentRlePolicyT,
+        InputIteratorT,
+        OffsetsOutputIteratorT,
+        LengthsOutputIteratorT,
+        EqualityOpT,
+        OffsetT> AgentRleT;
+
+    // Shared memory for AgentRle
+    __shared__ typename AgentRleT::TempStorage temp_storage;
+
+    // Process tiles
+    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        tile_status,
+        d_num_runs_out);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
+ */
+template <
+    typename InputIteratorT,            ///< Random-access input iterator type for reading input items \iterator
+    typename OffsetsOutputIteratorT,    ///< Random-access output iterator type for writing run-offset values \iterator
+    typename LengthsOutputIteratorT,    ///< Random-access output iterator type for writing run-length values \iterator
+    typename NumRunsOutputIteratorT,    ///< Output iterator type for recording the number of runs encountered \iterator
+    typename EqualityOpT,               ///< T equality operator type
+    typename OffsetT>                   ///< Signed integer type for global offsets
+struct DeviceRleDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    // The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),    // LengthT = (if output iterator's value type is void) ?
+        OffsetT,                                                                                                      // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                            // ...
else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig& device_rle_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that 
match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool store_warp_time_slicing; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = AgentRlePolicyT::BLOCK_THREADS; + items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; + load_policy = AgentRlePolicyT::LOAD_ALGORITHM; + store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; + scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_warp_time_slicing, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide run-length-encode using the + * specified kernel functions. + */ + template < + typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel + typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
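        /*
         * Note (illustrative, assuming the stock CUB front-end): this dispatch is normally
         * reached through cub::DeviceRunLengthEncode, using the two-pass temporary-storage
         * convention documented for d_temp_storage above; a typical caller looks like
         *
         *     void   *d_temp_storage = NULL;
         *     size_t  temp_storage_bytes = 0;
         *     cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
         *         d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
         *     cudaMalloc(&d_temp_storage, temp_storage_bytes);   // first call only sized the workspace
         *     cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
         *         d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
         */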
+ int ptx_version, ///< [in] PTX version of dispatch kernels + DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel + KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log device_scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors + device_scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for device_rle_sweep_kernel + int device_rle_kernel_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + device_rle_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log device_rle_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, 
device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); + + // Invoke device_rle_sweep_kernel + device_rle_sweep_kernel<<>>( + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig device_rle_config; + InitConfigs(ptx_version, device_rle_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceRleSweepKernel, + device_rle_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_scan.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_scan.cuh new file mode 100644 index 00000000000..8944dcd33e0 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_scan.cuh @@ -0,0 +1,563 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_arch.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanTileStateT, ///< Tile status interface type 
+ typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanKernel( + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScan< + ScanPolicyT, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT> AgentScanT; + + // Shared memory for AgentScan + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Process tiles + AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScan +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 15, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(256, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(96, 21, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(64, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so 
initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename ScanSweepKernelPtrT> ///< Function type of cub::DeviceScanKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
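        /*
         * Note (illustrative, assuming the stock CUB front-end): callers reach this dispatch
         * through cub::DeviceScan and rely on the NULL-query convention documented for
         * d_temp_storage above, e.g. for an exclusive sum over num_items elements:
         *
         *     void   *d_temp_storage = NULL;
         *     size_t  temp_storage_bytes = 0;
         *     cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
         *     cudaMalloc(&d_temp_storage, temp_storage_bytes);   // first call only sized the workspace
         *     cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
         */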
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel + KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_out; + (void)scan_op; + (void)init_value; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)scan_kernel; + (void)scan_kernel_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + scan_kernel_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, 
scan_sm_occupancy); + + // Invoke scan_kernel + scan_kernel<<>>( + d_in, + d_out, + tile_state, + start_tile, + scan_op, + init_value, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Get kernel kernel dispatch configurations + KernelConfig scan_kernel_config; + InitConfigs(ptx_version, scan_kernel_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceScanInitKernel, + DeviceScanKernel, + scan_kernel_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_select_if.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_select_if.cuh new file mode 100644 index 00000000000..6f033197c2d --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_select_if.cuh @@ -0,0 +1,542 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_select_if.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) +__global__ void DeviceSelectSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + SelectOpT 
select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + SelectedOutputIteratorT, + SelectOpT, + EqualityOpT, + OffsetT, + KEEP_REJECTS> AgentSelectIfT; + + // Shared memory for AgentSelectIf + __shared__ typename AgentSelectIfT::TempStorage temp_storage; + + // Process tiles + AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_selected_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DispatchSelectIf +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 10, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectIfPolicyT; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + SelectIfPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_if_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template 
Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_if_config.template Init(); + } + else if (ptx_version >= 300) + { + select_if_config.template Init(); + } + else if (ptx_version >= 200) + { + select_if_config.template Init(); + } + else if (ptx_version >= 130) + { + select_if_config.template Init(); + } + else + { + select_if_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide selection using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel + KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_flags; + (void)d_selected_out; + (void)d_num_selected_out; + (void)select_op; + (void)equality_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)scan_init_kernel; + (void)select_if_kernel; + (void)select_if_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke scan_init_kernel to initialize tile descriptors + scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_selected_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for select_if_kernel + int range_select_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + range_select_sm_occupancy, // out + select_if_kernel, + select_if_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log select_if_kernel configuration + if (debug_synchronous) _CubLog("Invoking 
select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); + + // Invoke select_if_kernel + select_if_kernel<<>>( + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + tile_status, + select_op, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig select_if_config; + InitConfigs(ptx_version, select_if_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + select_op, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceSelectSweepKernel, + select_if_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh new file mode 100644 index 00000000000..7432d58d69a --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh @@ -0,0 +1,942 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include + +#include "../../agent/single_pass_scan_operators.cuh" +#include "../../agent/agent_segment_fixup.cuh" +#include "../../agent/agent_spmv_orig.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * SpMV kernel entry points + *****************************************************************************/ + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. 
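+ *
+ * Note: despite the summary above, the kernel below handles the degenerate
+ * single-column case (num_cols == 1). For each row it applies the semiring's
+ * times operator to alpha, the row's single nonzero value (if any) and the
+ * matching entry of x, then combines that result, via the semiring's plus
+ * operator, with the semiring product of y[row] and beta.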
+ */ +template < + typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename SemiringT> ///< Semiring operations +__global__ void DeviceSpmv1ColKernel( + SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +{ + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); + + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; + OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; + + ValueT value = SemiringT::plus_ident(); + if (end_nonzero_idx != nonzero_idx) + { + value = SemiringT::times( spmv_params.alpha, + SemiringT::times(spmv_params.d_values[nonzero_idx], wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]])); + } + + spmv_params.d_vector_y[row_idx] = SemiringT::plus(value, SemiringT::times(spmv_params.d_vector_y[row_idx], spmv_params.beta)); + } +} + +/** + * Degenerate case: y = b*y + */ +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename SemiringT> ///< Semiring operations +__global__ void DeviceSpmvbyKernel( + SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +{ + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + spmv_params.d_vector_y[row_idx] = SemiringT::times(spmv_params.d_vector_y[row_idx], spmv_params.beta); + } +} + + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. 
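+ *
+ * \par
+ * For intuition, a minimal host-side sketch of the per-tile split that the
+ * MergePathSearch call below performs (an illustrative rewrite only; the
+ * helper name and plain int types here are not part of this header):
+ * \code
+ * // Find the (row, nonzero) coordinate at which a merge-path diagonal crosses
+ * // the path defined by row_end_offsets and the counting sequence of nonzero
+ * // indices. Each tile starts at diagonal = tile_idx * TILE_ITEMS.
+ * void merge_path_split(int diagonal, const int *row_end_offsets,
+ *                       int num_rows, int num_nonzeros,
+ *                       int &row, int &nonzero)
+ * {
+ *     int lo = (diagonal > num_nonzeros) ? diagonal - num_nonzeros : 0;
+ *     int hi = (diagonal < num_rows) ? diagonal : num_rows;
+ *     while (lo < hi)                              // binary search along the diagonal
+ *     {
+ *         int pivot = (lo + hi) / 2;
+ *         if (row_end_offsets[pivot] <= diagonal - pivot - 1)
+ *             lo = pivot + 1;                      // cross into more rows
+ *         else
+ *             hi = pivot;                          // cross into more nonzeros
+ *     }
+ *     row     = lo;                                // rows consumed before this tile
+ *     nonzero = diagonal - lo;                     // nonzeros consumed before this tile
+ * }
+ * \endcode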
+ */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + typename SpmvParamsT, ///< SpmvParams type + typename SemiringT> ///< Semiring type +__global__ void DeviceSpmvSearchKernel( + int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) + CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates + SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +{ + /// Constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + typedef CacheModifiedInputIterator< + SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + // Find the starting coordinate for all tiles (plus the end coordinate of the last one) + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_merge_tiles + 1) + { + OffsetT diagonal = (tile_idx * TILE_ITEMS); + CoordinateT tile_coordinate; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coordinate); + + // Output starting offset + d_tile_coordinates[tile_idx] = tile_coordinate; + } +} + + +/** + * Spmv agent entry point + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ScanTileStateT, ///< Tile status interface type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + typename SemiringT, ///< Semiring type + bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 + bool HAS_BETA> ///< Whether the input parameter Beta is 0 +__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) +__global__ void DeviceSpmvKernel( + SpmvParams spmv_params, ///< [in] SpMV input parameter bundle + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_tiles, ///< [in] Number of merge tiles + ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel + int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +{ + // Spmv agent type specialization + typedef AgentSpmv< + SpmvPolicyT, + ValueT, + OffsetT, + SemiringT, + HAS_ALPHA, + HAS_BETA> + AgentSpmvT; + + // Shared memory for AgentSpmv + __shared__ typename AgentSpmvT::TempStorage temp_storage; + + AgentSpmvT(temp_storage, spmv_params).ConsumeTile( + d_tile_coordinates, + d_tile_carry_pairs, + num_tiles); + + // Initialize fixup tile status + tile_state.InitializeStatus(num_segment_fixup_tiles); + +} + + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename OffsetT, ///< Signed integer type for global offsets + typename SemiringT, ///< Semiring 
type + typename ScanTileStateT> ///< Tile status interface type +__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) +__global__ void DeviceSegmentFixupKernel( + OffsetT max_items, ///< [in] Limit on number of output items (number of rows). Used to prevent OOB writes. + PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block + AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates + OffsetT num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + ScanTileStateT tile_state) ///< [in] Tile status interface +{ + // Thread block type for reducing tiles of value segments + typedef AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + cub::Equality, + typename SemiringT::SumOp, + OffsetT, + SemiringT> + AgentSegmentFixupT; + + // Shared memory for AgentSegmentFixup + __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; + + // Process tiles + AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), SemiringT::SumOp()).ConsumeRange( + max_items, + num_items, + num_tiles, + tile_state); +} + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + */ +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for global offsets + typename SemiringT> ///< Semiring type +struct DispatchSpmv +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // SpmvParams bundle type + typedef SpmvParams SpmvParamsT; + + // 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM11 + struct Policy110 + { + typedef AgentSpmvPolicy< + 128, + 1, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM20 + struct Policy200 + { + typedef AgentSpmvPolicy< + 96, + 18, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_RAKING> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + + /// SM30 + struct Policy300 + { + typedef AgentSpmvPolicy< + 96, + 6, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + /// SM35 + struct Policy350 + 
{ + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 96 : 128, + (sizeof(ValueT) > 4) ? 4 : 7, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + /// SM37 + struct Policy370 + { + + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 128 : 128, + (sizeof(ValueT) > 4) ? 9 : 14, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM50 + struct Policy500 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 6 : 7, + LOAD_LDG, + LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_RAKING_MEMOIZE> + SegmentFixupPolicyT; + }; + + + /// SM60 + struct Policy600 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 5 : 7, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 370) + typedef Policy370 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; + struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &spmv_config, + KernelConfig &segment_fixup_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } 
+ else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 300) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + + } + else if (ptx_version >= 200) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduction using the + * specified kernel functions. + * + * If the input is larger than a single tile, this method uses two-passes of + * kernel invocations. + */ + template < + typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel + typename SpmvbyKernelT, + typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel + typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel + typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ SpmvbyKernelT spmv_by_kernel, + Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel + SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel + SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel + SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel + KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for + KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + cudaError error = cudaSuccess; + do + { + // degenerate case of y = beta*y + if (spmv_params.alpha == SemiringT::times_null()) + { + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + temp_storage_bytes = 1; + break; + } + + // Get search/init grid dims + int degen_by_block_size = INIT_KERNEL_THREADS; + int degen_by_grid_size = (spmv_params.num_rows + degen_by_block_size - 1) / degen_by_block_size; + + if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", + degen_by_grid_size, degen_by_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_by_kernel<<>>( + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + break; + } + + if (spmv_params.num_cols == 1) + { + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + temp_storage_bytes = 1; + break; + } + + // Get search/init grid dims + int degen_col_kernel_block_size = INIT_KERNEL_THREADS; + int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; + + if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", + degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_1col_kernel<<>>( + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + break; + } + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get max x-dimension of grid + int max_dim_x_i; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x_i, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + unsigned int max_dim_x = max_dim_x_i; + + // Total number of spmv work items + int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; + + // Tile sizes of kernels + int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; + int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; + + // Number of tiles for kernels + unsigned int 
num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; + unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; + + // Get SM occupancy for kernels + int spmv_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + spmv_sm_occupancy, + spmv_kernel, + spmv_config.block_threads))) break; + + int segment_fixup_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + segment_fixup_sm_occupancy, + segment_fixup_kernel, + segment_fixup_config.block_threads))) break; + + // Get grid dimensions + dim3 spmv_grid_size( + CUB_MIN(num_merge_tiles, max_dim_x), + (num_merge_tiles + max_dim_x - 1) / max_dim_x, + 1); + + dim3 segment_fixup_grid_size( + CUB_MIN(num_segment_fixup_tiles, max_dim_x), + (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, + 1); + + // Get the temporary storage allocation requirements + size_t allocation_sizes[3]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors + allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs + allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + void* allocations[3]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; + + // Alias the other allocations + KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs + CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates + + // Get search/init grid dims + int search_block_size = INIT_KERNEL_THREADS; + int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; + +#if (CUB_PTX_ARCH == 0) + // Init textures + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; +#endif + + if (search_grid_size < sm_count) +// if (num_merge_tiles < spmv_sm_occupancy * sm_count) + { + // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords + d_tile_coordinates = NULL; + } + else + { + // Use separate search kernel if we have enough spmv tiles to saturate the device + + // Log spmv_search_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", + search_grid_size, search_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_search_kernel<<>>( + num_merge_tiles, + d_tile_coordinates, + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log spmv_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, 
spmv_config.items_per_thread, spmv_sm_occupancy); + + // Invoke spmv_kernel + spmv_kernel<<>>( + spmv_params, + d_tile_coordinates, + d_tile_carry_pairs, + num_merge_tiles, + tile_state, + num_segment_fixup_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Run reduce-by-key fixup if necessary + if (num_merge_tiles > 1) + { + // Log segment_fixup_kernel configuration + if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); + + // Invoke segment_fixup_kernel + segment_fixup_kernel<<>>( + spmv_params.num_rows, + d_tile_carry_pairs, + spmv_params.d_vector_y, + num_merge_tiles, + num_segment_fixup_tiles, + tile_state); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + +#if (CUB_PTX_ARCH == 0) + // Free textures + if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; +#endif + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig spmv_config, segment_fixup_config; + InitConfigs(ptx_version, spmv_config, segment_fixup_config); + + // Dispatch + if (spmv_params.beta == SemiringT::times_null()) + { + if (spmv_params.alpha == SemiringT::times_ident()) + { + // Dispatch y = A*x + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmvbyKernel, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + else + { + // Dispatch y = alpha*A*x + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmvbyKernel, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + } + else + { + if (spmv_params.alpha == SemiringT::times_ident()) + { + // Dispatch y = A*x + beta*y + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmvbyKernel, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + else + { + // Dispatch y = alpha*A*x + beta*y + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmvbyKernel, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + } + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/grid/grid_barrier.cuh b/cpp/nvgraph/external/cub_semiring/grid/grid_barrier.cuh new file mode 100644 index 00000000000..d9f83360b9e --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/grid/grid_barrier.cuh @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ + +#pragma once + +#include "../util_debug.cuh" +#include "../util_namespace.cuh" +#include "../thread/thread_load.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ +class GridBarrier +{ +protected : + + typedef unsigned int SyncFlag; + + // Counters in global device memory + SyncFlag* d_sync; + +public: + + /** + * Constructor + */ + GridBarrier() : d_sync(NULL) {} + + + /** + * Synchronize + */ + __device__ __forceinline__ void Sync() const + { + volatile SyncFlag *d_vol_sync = d_sync; + + // Threadfence and syncthreads to make sure global writes are visible before + // thread-0 reports in with its sync counter + __threadfence(); + CTA_SYNC(); + + if (blockIdx.x == 0) + { + // Report in ourselves + if (threadIdx.x == 0) + { + d_vol_sync[blockIdx.x] = 1; + } + + CTA_SYNC(); + + // Wait for everyone else to report in + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + while (ThreadLoad(d_sync + peer_block) == 0) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + + // Let everyone know it's safe to proceed + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + d_vol_sync[peer_block] = 0; + } + } + else + { + if (threadIdx.x == 0) + { + // Report in + d_vol_sync[blockIdx.x] = 1; + + // Wait for acknowledgment + while (ThreadLoad(d_sync + blockIdx.x) == 1) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + } + } +}; + + +/** + * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. + * + * Uses RAII for lifetime, i.e., device resources are reclaimed when + * the destructor is called. 
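+ *
+ * \par
+ * A minimal usage sketch (illustrative only; the kernel, its arguments and the
+ * launch shape are hypothetical, and the grid must be small enough that all
+ * thread blocks are co-resident on the device, otherwise the barrier deadlocks):
+ * \code
+ * __global__ void TwoPhaseKernel(int *d_data, cub::GridBarrier barrier)
+ * {
+ *     // ... phase 1: every thread block writes its partial results to d_data ...
+ *     barrier.Sync();            // software barrier across all thread blocks
+ *     // ... phase 2: blocks may now read each other's phase-1 results ...
+ * }
+ *
+ * cub::GridBarrierLifetime barrier;
+ * barrier.Setup(grid_size);      // lazily allocates and zeroes the sync counters
+ * TwoPhaseKernel<<<grid_size, block_size>>>(d_data, barrier);  // passed (sliced) by value
+ * \endcode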
+ */ +class GridBarrierLifetime : public GridBarrier +{ +protected: + + // Number of bytes backed by d_sync + size_t sync_bytes; + +public: + + /** + * Constructor + */ + GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} + + + /** + * DeviceFrees and resets the progress counters + */ + cudaError_t HostReset() + { + cudaError_t retval = cudaSuccess; + if (d_sync) + { + CubDebug(retval = cudaFree(d_sync)); + d_sync = NULL; + } + sync_bytes = 0; + return retval; + } + + + /** + * Destructor + */ + virtual ~GridBarrierLifetime() + { + HostReset(); + } + + + /** + * Sets up the progress counters for the next kernel launch (lazily + * allocating and initializing them if necessary) + */ + cudaError_t Setup(int sweep_grid_size) + { + cudaError_t retval = cudaSuccess; + do { + size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); + if (new_sync_bytes > sync_bytes) + { + if (d_sync) + { + if (CubDebug(retval = cudaFree(d_sync))) break; + } + + sync_bytes = new_sync_bytes; + + // Allocate and initialize to zero + if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; + if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + } + } while (0); + + return retval; + } +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/grid/grid_even_share.cuh b/cpp/nvgraph/external/cub_semiring/grid/grid_even_share.cuh new file mode 100644 index 00000000000..3ba29da7ae6 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/grid/grid_even_share.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. 
Each thread block gets roughly the same number of fixed-size work units (grains). + */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" +#include "grid_mapping.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridEvenShare is a descriptor utility for distributing input among + * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly + * the same number of input tiles. + * + * \par Overview + * Each thread block is assigned a consecutive sequence of input tiles. To help + * preserve alignment and eliminate the overhead of guarded loads for all but the + * last thread block, to GridEvenShare assigns one of three different amounts of + * work to a given thread block: "big", "normal", or "last". The "big" workloads + * are one scheduling grain larger than "normal". The "last" work unit for the + * last thread block may be partially-full if the input is not an even multiple of + * the scheduling grain size. + * + * \par + * Before invoking a child grid, a parent thread will typically construct an + * instance of GridEvenShare. The instance can be passed to child thread blocks + * which can initialize their per-thread block offsets using \p BlockInit(). + */ +template +struct GridEvenShare +{ +private: + + OffsetT total_tiles; + int big_shares; + OffsetT big_share_items; + OffsetT normal_share_items; + OffsetT normal_base_offset; + +public: + + /// Total number of input items + OffsetT num_items; + + /// Grid size in thread blocks + int grid_size; + + /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles + OffsetT block_offset; + + /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles + OffsetT block_end; + + /// Stride between input tiles + OffsetT block_stride; + + + /** + * \brief Constructor. + */ + __host__ __device__ __forceinline__ GridEvenShare() : + total_tiles(0), + big_shares(0), + big_share_items(0), + normal_share_items(0), + normal_base_offset(0), + num_items(0), + grid_size(0), + block_offset(0), + block_end(0), + block_stride(0) + {} + + + /** + * \brief Dispatch initializer. To be called prior prior to kernel launch. + */ + __host__ __device__ __forceinline__ void DispatchInit( + OffsetT num_items, ///< Total number of input items + int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) + int tile_items) ///< Number of data items per input tile + { + this->block_offset = num_items; // Initialize past-the-end + this->block_end = num_items; // Initialize past-the-end + this->num_items = num_items; + this->total_tiles = (num_items + tile_items - 1) / tile_items; + this->grid_size = CUB_MIN(total_tiles, max_grid_size); + OffsetT avg_tiles_per_block = total_tiles / grid_size; + this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks + this->normal_share_items = avg_tiles_per_block * tile_items; + this->normal_base_offset = big_shares * tile_items; + this->big_share_items = normal_share_items + tile_items; + } + + + /** + * \brief Initializes ranges for the specified thread block index. Specialized + * for a "raking" access pattern in which each thread block is assigned a + * consecutive sequence of input tiles. 
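+ *
+ * \par
+ * For example, with \p num_items = 1000, \p tile_items = 128 and
+ * \p max_grid_size = 3, DispatchInit() computes total_tiles = 8, grid_size = 3,
+ * big_shares = 2, big_share_items = 384, normal_share_items = 256 and
+ * normal_base_offset = 256. Under this raking mapping, block 0 then owns items
+ * [0, 384), block 1 owns [384, 768) and block 2 owns [768, 1000), whose last
+ * tile is only partially full.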
+ */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = TILE_ITEMS; + if (block_id < big_shares) + { + // This thread block gets a big share of grains (avg_tiles_per_block + 1) + block_offset = (block_id * big_share_items); + block_end = block_offset + big_share_items; + } + else if (block_id < total_tiles) + { + // This thread block gets a normal share of grains (avg_tiles_per_block) + block_offset = normal_base_offset + (block_id * normal_share_items); + block_end = CUB_MIN(num_items, block_offset + normal_share_items); + } + // Else default past-the-end + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = grid_size * TILE_ITEMS; + block_offset = (block_id * TILE_ITEMS); + block_end = num_items; + } + + + /** + * \brief Block-initialization, specialized for "strip mining" access + * pattern in which the input tiles assigned to each thread block are + * separated by a stride equal to the the extent of the grid. + */ + template < + int TILE_ITEMS, + GridMappingStrategy STRATEGY> + __device__ __forceinline__ void BlockInit() + { + BlockInit(blockIdx.x, Int2Type()); + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + this->block_offset = block_offset; + this->block_end = block_end; + this->block_stride = TILE_ITEMS; + } + + +}; + + + + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/grid/grid_mapping.cuh b/cpp/nvgraph/external/cub_semiring/grid/grid_mapping.cuh new file mode 100644 index 00000000000..6cd89209f83 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/grid/grid_mapping.cuh @@ -0,0 +1,113 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + +/** + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ +enum GridMappingStrategy +{ + /** + * \brief An a "raking" access pattern in which each thread block is + * assigned a consecutive sequence of input tiles + * + * \par Overview + * The input is evenly partitioned into \p p segments, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each segment is comprised of + * consecutive tiles, where a tile is a small, constant-sized unit of input + * to be processed to completion before the thread block terminates or + * obtains more work. The kernel invokes \p p thread blocks, each + * of which iteratively consumes a segment of n/p elements + * in tile-size increments. + */ + GRID_MAPPING_RAKE, + + /** + * \brief An a "strip mining" access pattern in which the input tiles assigned + * to each thread block are separated by a stride equal to the the extent of + * the grid. + * + * \par Overview + * The input is evenly partitioned into \p p sets, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each set is comprised of + * data tiles separated by stride \p tiles, where a tile is a small, + * constant-sized unit of input to be processed to completion before the + * thread block terminates or obtains more work. The kernel invokes \p p + * thread blocks, each of which iteratively consumes a segment of + * n/p elements in tile-size increments. + */ + GRID_MAPPING_STRIP_MINE, + + /** + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is treated as a queue to be dynamically consumed by a grid of + * thread blocks. Work is atomically dequeued in tiles, where a tile is a + * unit of input to be processed to completion before the thread block + * terminates or obtains more work. The grid size \p p is constant, + * loosely corresponding to the number of thread blocks that may actively + * reside on the target device. 
+ */ + GRID_MAPPING_DYNAMIC, +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/grid/grid_queue.cuh b/cpp/nvgraph/external/cub_semiring/grid/grid_queue.cuh new file mode 100644 index 00000000000..f413c6d2c4a --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/grid/grid_queue.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridQueue is a descriptor utility for dynamic queue management. + */ + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_debug.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridQueue is a descriptor utility for dynamic queue management. + * + * \par Overview + * GridQueue descriptors provides abstractions for "filling" or + * "draining" globally-shared vectors. + * + * \par + * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, + * returning a unique offset for the calling thread to write its items. + * The GridQueue maintains the total "fill-size". The fill counter must be reset + * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that + * will be filling. + * + * \par + * Similarly, a "draining" GridQueue works by works by atomically-incrementing a + * zero-initialized counter, returning a unique offset for the calling thread to + * read its items. Threads can safely drain until the array's logical fill-size is + * exceeded. The drain counter must be reset using GridQueue::ResetDrain or + * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that + * will be filling. 
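A sketch (not taken from the library) of how a kernel typically walks its `GridEvenShare` range under one of these mapping strategies. The include paths and the `cub::` namespace assume a stock CUB layout, which the vendored `cub_semiring` copy may wrap differently; `TILE_ITEMS` and the sizes in `main` are illustrative.

```cpp
// Counting how many items each thread block owns under GRID_MAPPING_RAKE.
// Swapping the strategy tag to GRID_MAPPING_STRIP_MINE changes only block_stride.
#include <cub/grid/grid_even_share.cuh>
#include <cub/grid/grid_mapping.cuh>

template <int TILE_ITEMS, cub::GridMappingStrategy STRATEGY>
__global__ void CountItemsKernel(cub::GridEvenShare<int> even_share, int *d_block_items)
{
    // RAKE: block_stride == TILE_ITEMS (consecutive tiles per block).
    // STRIP_MINE: block_stride == grid_size * TILE_ITEMS (tiles strided by the grid).
    even_share.BlockInit<TILE_ITEMS, STRATEGY>();

    int items = 0;
    for (int offset = even_share.block_offset;
         offset < even_share.block_end;
         offset += even_share.block_stride)
    {
        // This block owns items [offset, min(offset + TILE_ITEMS, block_end)).
        items += min(even_share.block_end, offset + TILE_ITEMS) - offset;
    }
    if (threadIdx.x == 0)
        d_block_items[blockIdx.x] = items;
}

int main()
{
    const int TILE_ITEMS = 128;

    cub::GridEvenShare<int> even_share;
    even_share.DispatchInit(100000 /*num_items*/, 64 /*max grid size*/, TILE_ITEMS);

    int *d_block_items;
    cudaMalloc(&d_block_items, even_share.grid_size * sizeof(int));

    CountItemsKernel<TILE_ITEMS, cub::GRID_MAPPING_RAKE>
        <<<even_share.grid_size, 128>>>(even_share, d_block_items);
    cudaDeviceSynchronize();

    cudaFree(d_block_items);
    return 0;
}
```

The same loop body serves both static strategies because `BlockInit()` is what sets `block_offset`, `block_end`, and `block_stride`; only `GRID_MAPPING_DYNAMIC` requires the separate `GridQueue` machinery defined below.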
(For dynamic work distribution of existing data, the corresponding fill-size + * is simply the number of elements in the array.) + * + * \par + * Iterative work management can be implemented simply with a pair of flip-flopping + * work buffers, each with an associated set of fill and drain GridQueue descriptors. + * + * \tparam OffsetT Signed integer type for global offsets + */ +template +class GridQueue +{ +private: + + /// Counter indices + enum + { + FILL = 0, + DRAIN = 1, + }; + + /// Pair of counters + OffsetT *d_counters; + +public: + + /// Returns the device allocation size in bytes needed to construct a GridQueue instance + __host__ __device__ __forceinline__ + static size_t AllocationSize() + { + return sizeof(OffsetT) * 2; + } + + + /// Constructs an invalid GridQueue descriptor + __host__ __device__ __forceinline__ GridQueue() + : + d_counters(NULL) + {} + + + /// Constructs a GridQueue descriptor around the device storage allocation + __host__ __device__ __forceinline__ GridQueue( + void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). + : + d_counters((OffsetT*) d_storage) + {} + + + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( + OffsetT fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + OffsetT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); +#endif + } + + + /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// Returns the fill-size established by the parent or by the previous kernel. + __host__ __device__ __forceinline__ cudaError_t FillSize( + OffsetT &fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + fill_size = d_counters[FILL]; + return cudaSuccess; +#else + return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); +#endif + } + + + /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + __device__ __forceinline__ OffsetT Drain(OffsetT num_items) + { + return atomicAdd(d_counters + DRAIN, num_items); + } + + + /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 
+ __device__ __forceinline__ OffsetT Fill(OffsetT num_items) + { + return atomicAdd(d_counters + FILL, num_items); + } +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reset grid queue (call with 1 block of 1 thread) + */ +template +__global__ void FillAndResetDrainKernel( + GridQueue grid_queue, + OffsetT num_items) +{ + grid_queue.FillAndResetDrain(num_items); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/cpp/nvgraph/external/cub_semiring/host/mutex.cuh b/cpp/nvgraph/external/cub_semiring/host/mutex.cuh new file mode 100644 index 00000000000..0054f1f916d --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/host/mutex.cuh @@ -0,0 +1,171 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
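A rough usage sketch of the fill/drain protocol described above. The include path and public `cub::` namespace are assumptions (the vendored copy may wrap them differently), and for simplicity the fill-size is passed to the kernel directly rather than read back through `FillSize()`.

```cpp
// Minimal sketch of GridQueue draining: the host sets the fill-size, then each
// thread block atomically reserves tile-sized chunks until the queue is empty.
#include <cub/grid/grid_queue.cuh>

__global__ void DrainKernel(cub::GridQueue<int> queue, int num_items, int *d_out)
{
    const int TILE = 256;                    // one item per thread per tile
    while (true)
    {
        __shared__ int tile_base;
        if (threadIdx.x == 0)
            tile_base = queue.Drain(TILE);   // atomically reserve [tile_base, tile_base + TILE)
        __syncthreads();

        if (tile_base >= num_items)
            break;                           // queue exhausted for this block

        int idx = tile_base + threadIdx.x;
        if (idx < num_items)
            d_out[idx] = idx;                // arbitrary per-item work
        __syncthreads();                     // keep tile_base stable until all threads used it
    }
}

int main()
{
    const int num_items = 10000;

    void *d_queue_storage;
    cudaMalloc(&d_queue_storage, cub::GridQueue<int>::AllocationSize());
    cub::GridQueue<int> queue(d_queue_storage);

    int *d_out;
    cudaMalloc(&d_out, num_items * sizeof(int));

    // Host sets the fill-size and resets the drain counter before the draining kernel.
    queue.FillAndResetDrain(num_items);
    DrainKernel<<<64, 256>>>(queue, num_items, d_out);
    cudaDeviceSynchronize();

    cudaFree(d_out);
    cudaFree(d_queue_storage);
    return 0;
}
```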
+ * + ******************************************************************************/ + +/** + * \file + * Simple portable mutex + */ + + +#pragma once + +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + #include +#else + #if defined(_WIN32) || defined(_WIN64) + #include + + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #undef WIN32_LEAN_AND_MEAN + #undef NOMINMAX + + /** + * Compiler read/write barrier + */ + #pragma intrinsic(_ReadWriteBarrier) + + #endif +#endif + +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Simple portable mutex + * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) + * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) + */ +struct Mutex +{ +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + + std::mutex mtx; + + void Lock() + { + mtx.lock(); + } + + void Unlock() + { + mtx.unlock(); + } + + void TryLock() + { + mtx.try_lock(); + } + +#else //__cplusplus > 199711L + + #if defined(_MSC_VER) + + // Microsoft VC++ + typedef long Spinlock; + + #else + + // GNU g++ + typedef int Spinlock; + + /** + * Compiler read/write barrier + */ + __forceinline__ void _ReadWriteBarrier() + { + __sync_synchronize(); + } + + /** + * Atomic exchange + */ + __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) + { + // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier + _ReadWriteBarrier(); + return __sync_lock_test_and_set(Target, Value); + } + + /** + * Pause instruction to prevent excess processor bus usage + */ + __forceinline__ void YieldProcessor() + { + } + + #endif // defined(_MSC_VER) + + /// Lock member + volatile Spinlock lock; + + /** + * Constructor + */ + Mutex() : lock(0) {} + + /** + * Return when the specified spinlock has been acquired + */ + __forceinline__ void Lock() + { + while (1) + { + if (!_InterlockedExchange(&lock, 1)) return; + while (lock) YieldProcessor(); + } + } + + + /** + * Release the specified spinlock + */ + __forceinline__ void Unlock() + { + _ReadWriteBarrier(); + lock = 0; + } + +#endif // __cplusplus > 199711L + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cpp/nvgraph/external/cub_semiring/iterator/arg_index_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/arg_index_input_iterator.cuh new file mode 100644 index 00000000000..d3bce583d8c --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/arg_index_input_iterator.cuh @@ -0,0 +1,259 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#include + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). + * + * \par Overview + * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose + * \p key field is \p i and whose \p value field is itr[i]. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. 
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto + * dereference an array of doubles + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::ArgIndexInputIterator itr(d_in); + * + * // Within device code: + * typedef typename cub::ArgIndexInputIterator::value_type Tuple; + * Tuple item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 8.0 @ 0 + * + * itr = itr + 6; + * item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 9.0 @ 6 + * + * \endcode + * + * \tparam InputIteratorT The value type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) + */ +template < + typename InputIteratorT, + typename OffsetT = ptrdiff_t, + typename OutputValueT = typename std::iterator_traits::value_type> +class ArgIndexInputIterator +{ +public: + + // Required iterator traits + typedef ArgIndexInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef KeyValuePair value_type; ///< The type of the element the iterator can point to + typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to + typedef value_type reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + InputIteratorT itr; + difference_type offset; + +public: + + /// Constructor + __host__ __device__ __forceinline__ ArgIndexInputIterator( + InputIteratorT itr, ///< Input iterator to wrap + difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator + : + itr(itr), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + value_type retval; + retval.value = itr[offset]; + retval.key = offset; + return retval; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(itr, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(itr, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ 
self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((itr == rhs.itr) && (offset == rhs.offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((itr != rhs.itr) || (offset != rhs.offset)); + } + + /// Normalize + __host__ __device__ __forceinline__ void normalize() + { + itr += offset; + offset = 0; + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/iterator/cache_modified_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/cache_modified_input_iterator.cuh new file mode 100644 index 00000000000..0c0252c8b1a --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/cache_modified_input_iterator.cuh @@ -0,0 +1,240 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
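The doxygen snippet embedded above is schematic (its `Tuple item_offset_pair.key = *itr;` line is not literal C++). A compile-ready host-side variant, assuming the stock CUB include path, looks like this; wrapped host memory may be dereferenced on the host, as the overview notes.

```cpp
// Host-side sketch of ArgIndexInputIterator pairing values with their offsets.
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cstdio>

int main()
{
    double data[] = {8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0};

    cub::ArgIndexInputIterator<double*> itr(data);

    // Dereferencing yields a KeyValuePair whose key is the offset and whose
    // value is data[offset].
    auto first = *itr;
    printf("%f @ %d\n", first.value, (int) first.key);   // 8.0 @ 0

    auto sixth = *(itr + 6);
    printf("%f @ %d\n", sixth.value, (int) sixth.key);   // 9.0 @ 6
    return 0;
}
```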
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * + * \par Overview + * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by reading \p ValueType values through loads modified by \p MODIFIER. + * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", + * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto + * dereference a device array of double using the "ldg" PTX load modifier + * (i.e., load values through texture cache). + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::CacheModifiedInputIterator itr(d_in); + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * \endcode + * + * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedInputIterator +{ +public: + + // Required iterator traits + typedef CacheModifiedInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + +public: + + /// Wrapped native pointer + ValueType* ptr; + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedInputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) 
+ {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __device__ __forceinline__ reference operator*() const + { + return ThreadLoad(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __device__ __forceinline__ reference operator[](Distance n) const + { + return ThreadLoad(ptr + n); + } + + /// Structure dereference + __device__ __forceinline__ pointer operator->() + { + return &ThreadLoad(ptr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/iterator/cache_modified_output_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/cache_modified_output_iterator.cuh new file mode 100644 index 00000000000..8dbaafa61c5 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/cache_modified_output_iterator.cuh @@ -0,0 +1,254 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
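A short device-side sketch of the wrapper: the iterator is constructed on the host around a raw device pointer but only dereferenced inside the kernel, where each load is issued with the `LOAD_LDG` modifier. The include path, kernel, and launch shape are illustrative assumptions.

```cpp
// Sketch: streaming reads through CacheModifiedInputIterator<LOAD_LDG, ...>.
#include <cub/iterator/cache_modified_input_iterator.cuh>

__global__ void HalveKernel(cub::CacheModifiedInputIterator<cub::LOAD_LDG, float> d_in,
                            float *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = d_in[i] * 0.5f;   // load goes through the read-only (texture) cache
}

int main()
{
    const int n = 1 << 20;
    float *d_raw_in, *d_out;
    cudaMalloc(&d_raw_in, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));

    // Constructed on the host; dereferenced only in device code.
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, float> d_in(d_raw_in);

    HalveKernel<<<(n + 255) / 256, 256>>>(d_in, d_out, n);
    cudaDeviceSynchronize();

    cudaFree(d_raw_in);
    cudaFree(d_out);
    return 0;
}
```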
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * + * \par Overview + * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by writing \p ValueType values through stores modified by \p MODIFIER. + * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", + * "STORE_CG", "STORE_CS", "STORE_WT", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * dereference a device array of doubles using the "wt" PTX load modifier + * (i.e., write-through to system memory). 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_out; // e.g., [, , , , , , ] + * + * // Create an iterator wrapper + * cub::CacheModifiedOutputIterator itr(d_out); + * + * // Within device code: + * itr[0] = 8.0; + * itr[1] = 66.0; + * itr[55] = 24.0; + * + * \endcode + * + * \par Usage Considerations + * - Can only be dereferenced within device code + * + * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheStoreModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedOutputIterator +{ +private: + + // Proxy object + struct Reference + { + ValueType* ptr; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} + + /// Assignment + __device__ __forceinline__ ValueType operator =(ValueType val) + { + ThreadStore(ptr, val); + return val; + } + }; + +public: + + // Required iterator traits + typedef CacheModifiedOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType* ptr; + +public: + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedOutputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + 
{ + return Reference(ptr + n); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/iterator/constant_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/constant_input_iterator.cuh new file mode 100644 index 00000000000..0b7af478d74 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/constant_input_iterator.cuh @@ -0,0 +1,235 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * + * \par Overview + * - Read references to a ConstantInputIteratorTiterator always return the supplied constant + * of type \p ValueType. + * - Can be used with any data type. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. 
+ * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p ConstantInputIteratorTto + * dereference a sequence of homogeneous doubles. + * \par + * \code + * #include // or equivalently + * + * cub::ConstantInputIterator itr(5.0); + * + * printf("%f\n", itr[0]); // 5.0 + * printf("%f\n", itr[1]); // 5.0 + * printf("%f\n", itr[2]); // 5.0 + * printf("%f\n", itr[50]); // 5.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class ConstantInputIterator +{ +public: + + // Required iterator traits + typedef ConstantInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + OffsetT offset; +#ifdef _WIN32 + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ ConstantInputIterator( + ValueType val, ///< Starting value for the iterator instance to report + OffsetT offset = 0) ///< Base offset + : + val(val), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const + { + return val; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ 
__device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset) && ((val == rhs.val)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset) || (val!= rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "," << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/iterator/counting_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/counting_input_iterator.cuh new file mode 100644 index 00000000000..3b42a00d181 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/counting_input_iterator.cuh @@ -0,0 +1,228 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + +/** + * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * + * \par Overview + * - After initializing a CountingInputIteratorTto a certain integer \p base, read references + * at \p offset will return the value \p base + \p offset. 
+ * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CountingInputIteratorTto + * dereference a sequence of incrementing integers. + * \par + * \code + * #include // or equivalently + * + * cub::CountingInputIterator itr(5); + * + * printf("%d\n", itr[0]); // 5 + * printf("%d\n", itr[1]); // 6 + * printf("%d\n", itr[2]); // 7 + * printf("%d\n", itr[50]); // 55 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class CountingInputIterator +{ +public: + + // Required iterator traits + typedef CountingInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CountingInputIterator( + const ValueType &val) ///< Starting value for the iterator instance to report + : + val(val) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + val++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + val++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val + (ValueType) n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + val += (ValueType) n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val - (ValueType) n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + val -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return (difference_type) (val - other.val); + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return val + (ValueType) n; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (val == 
rhs.val); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (val != rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "]"; + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/iterator/discard_output_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/discard_output_iterator.cuh new file mode 100644 index 00000000000..1fca08c062d --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/discard_output_iterator.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
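Both generators above can be dereferenced on the host as well as the device, so a quick host-side sketch (stock CUB include paths assumed) is enough to show their behavior and pointer-like arithmetic.

```cpp
// Host-side sketch of the constant and counting generator iterators.
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include <cstdio>

int main()
{
    cub::ConstantInputIterator<double> ones(1.0);     // every offset reads 1.0
    cub::CountingInputIterator<int>    ramp(5);       // reads 5, 6, 7, ...

    printf("%f %f\n", ones[0], ones[1000]);           // 1.0 1.0
    printf("%d %d %d\n", ramp[0], ramp[1], ramp[50]); // 5 6 55

    // Iterator arithmetic behaves like pointer arithmetic over the virtual sequence.
    cub::CountingInputIterator<int> shifted = ramp + 10;
    printf("%d %td\n", *shifted, shifted - ramp);     // 15 10
    return 0;
}
```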
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A discard iterator + */ +template +class DiscardOutputIterator +{ +public: + + // Required iterator traits + typedef DiscardOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef void reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + OffsetT offset; + +#if defined(_WIN32) || !defined(_WIN64) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ DiscardOutputIterator( + OffsetT offset = 0) ///< Base offset + : + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ self_type& operator*() + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ self_type& operator[](Distance n) + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return; + } + + /// Assignment to self (no-op) + __host__ __device__ __forceinline__ void operator=(self_type const& 
other) + { + offset = other.offset; + } + + /// Assignment to anything else (no-op) + template + __host__ __device__ __forceinline__ void operator=(T const&) + {} + + /// Cast to void* operator + __host__ __device__ __forceinline__ operator void*() const { return NULL; } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh new file mode 100644 index 00000000000..623609452fd --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh @@ -0,0 +1,310 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. + * + * \par Overview + * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be + * created by the host thread, but can be used by any descendant kernel. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexObjInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... 
+ * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + typename OffsetT = ptrdiff_t> +class TexObjInputIterator +{ +public: + + // Required iterator traits + typedef TexObjInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + // Largest texture word we can use in device + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + +private: + + T* ptr; + difference_type tex_offset; + cudaTextureObject_t tex_obj; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TexObjInputIterator() + : + ptr(NULL), + tex_offset(0), + tex_obj(0) + {} + + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + this->tex_offset = tex_offset; + + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + cudaResourceDesc res_desc; + cudaTextureDesc tex_desc; + memset(&res_desc, 0, sizeof(cudaResourceDesc)); + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = this->ptr; + res_desc.res.linear.desc = channel_desc; + res_desc.res.linear.sizeInBytes = bytes; + tex_desc.readMode = cudaReadModeElementType; + return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return cudaDestroyTextureObject(tex_obj); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Move array of uninitialized words, then alias and assign to return value + TextureWord words[TEXTURE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch( + tex_obj, + (tex_offset * TEXTURE_MULTIPLE) + i); + } + + // Load from words + return *reinterpret_cast(words); 
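+            // The loop above fetches T as TEXTURE_MULTIPLE separate TextureWord values
+            // (texture fetches only support the built-in word types); the reinterpret_cast
+            // then aliases that word array back into a single T return value.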
+#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh new file mode 100644 index 00000000000..0305c78cd2c --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh @@ -0,0 +1,374 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer + +#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Static file-scope Tesla/Fermi-style texture references + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +// Anonymous namespace +namespace { + +/// Global texture reference specialized by type +template +struct IteratorTexRef +{ + /// And by unique ID + template + struct TexId + { + // Largest texture word we can use in device + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord), + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + + // Texture reference type + typedef texture TexRef; + + // Texture reference + static TexRef ref; + + /// Bind texture + static cudaError_t BindTexture(void *d_in, size_t &offset) + { + if (d_in) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); + ref.channelDesc = tex_desc; + return (CubDebug(cudaBindTexture(&offset, ref, d_in))); + } + + return cudaSuccess; + } + + /// Unbind texture + static cudaError_t UnbindTexture() + { + return CubDebug(cudaUnbindTexture(ref)); + } + + /// Fetch element + template + static __device__ __forceinline__ T Fetch(Distance tex_offset) + { + DeviceWord temp[DEVICE_MULTIPLE]; + TextureWord *words = reinterpret_cast(temp); + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i); + } + + return reinterpret_cast(temp); + } + }; +}; + +// Texture reference definitions +template +template +typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; + + +} // Anonymous namespace + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. + * + * \par Overview + * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. 
+ * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * reference. Only one TexRefInputIteratorTinstance can be bound at any given time for a + * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host + * thread, and (4) compilation .o unit. + * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be + * created by the host thread and used by a top-level kernel (i.e. the one which is launched + * from the host). + * - Compatible with Thrust API v1.7 or newer. + * - Compatible with CUDA toolkit v5.5 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexRefInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... + * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + int UNIQUE_ID, + typename OffsetT = ptrdiff_t> +class TexRefInputIterator +{ +public: + + // Required iterator traits + typedef TexRefInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + T* ptr; + difference_type tex_offset; + + // Texture reference wrapper (old Tesla/Fermi-style textures) + typedef typename IteratorTexRef::template TexId TexId; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TexRefInputIterator() + : + ptr(NULL), + tex_offset(0) + {} + + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + size_t offset; + cudaError_t retval = 
TexId::BindTexture(this->ptr + tex_offset, offset); + this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); + return retval; + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return TexId::UnbindTexture(); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Use the texture reference + return TexId::Fetch(tex_offset); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + +#endif // CUDA_VERSION diff --git a/cpp/nvgraph/external/cub_semiring/iterator/transform_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/transform_input_iterator.cuh new file mode 100644 index 00000000000..39258a40c9b --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/iterator/transform_input_iterator.cuh @@ -0,0 +1,252 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for transforming dereferenced values. + * + * \par Overview + * - TransformInputIteratorTwraps a unary conversion functor of type \p + * ConversionOp and a random-access input iterator of type InputIteratorT, + * using the former to produce references of type \p ValueType from the latter. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TransformInputIteratorTto + * dereference an array of integers, tripling the values and converting them to doubles. + * \par + * \code + * #include // or equivalently + * + * // Functor for tripling integer values and converting to doubles + * struct TripleDoubler + * { + * __host__ __device__ __forceinline__ + * double operator()(const int &a) const { + * return double(a * 3); + * } + * }; + * + * // Declare, allocate, and initialize a device array + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * TripleDoubler conversion_op; + * + * // Create an iterator wrapper + * cub::TransformInputIterator itr(d_in, conversion_op); + * + * // Within device code: + * printf("%f\n", itr[0]); // 24.0 + * printf("%f\n", itr[1]); // 18.0 + * printf("%f\n", itr[6]); // 27.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 
+ * \tparam InputIteratorT The type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * + */ +template < + typename ValueType, + typename ConversionOp, + typename InputIteratorT, + typename OffsetT = ptrdiff_t> +class TransformInputIterator +{ +public: + + // Required iterator traits + typedef TransformInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ConversionOp conversion_op; + InputIteratorT input_itr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TransformInputIterator( + InputIteratorT input_itr, ///< Input iterator to wrap + ConversionOp conversion_op) ///< Conversion functor to wrap + : + conversion_op(conversion_op), + input_itr(input_itr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + input_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + input_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return conversion_op(*input_itr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(input_itr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + input_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(input_itr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + input_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return input_itr - other.input_itr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(input_itr[n]); + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &conversion_op(*input_itr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (input_itr != rhs.input_itr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + + 
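As a concrete illustration of the snippet in the class documentation above, a minimal host program that wraps a device array with this iterator might look like the sketch below. It assumes the upstream `cub::DeviceReduce::Sum` entry point (not part of this hunk) as the consumer, and the functor and buffer names are illustrative; the vendored copy may additionally nest the `cub` namespace via `CUB_NS_PREFIX`.

```cpp
// Minimal sketch, assuming the upstream CUB headers and API.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

// Functor for tripling integer values and converting them to doubles
struct TripleDoubler
{
    __host__ __device__ __forceinline__
    double operator()(const int &a) const { return double(a * 3); }
};

int main()
{
    const int num_items = 7;
    const int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int    *d_in;
    double *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, sizeof(double));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Template arguments: ValueType, ConversionOp, InputIteratorT (OffsetT defaults to ptrdiff_t)
    cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, TripleDoubler());

    // Query temporary storage, then reduce: 3 * (8+6+7+5+3+0+9) = 114
    void  *d_temp     = NULL;
    size_t temp_bytes = 0;
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_out, num_items);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_out, num_items);

    double h_out = 0.0;
    cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
    printf("sum = %f\n", h_out);   // expected 114.0

    cudaFree(d_in); cudaFree(d_out); cudaFree(d_temp);
    return 0;
}
```

Because the conversion happens inside `operator*`, the tripled doubles are never materialized in memory; the reduction streams them directly out of the wrapped pointer.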
+/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/thread/thread_load.cuh b/cpp/nvgraph/external/cub_semiring/thread/thread_load.cuh new file mode 100644 index 00000000000..9de4bd4149b --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/thread/thread_load.cuh @@ -0,0 +1,438 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory load operations. + */ +enum CacheLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit load using cache-global modifier: + * int *d_in; + * int val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 16-bit load using default modifier + * short *d_in; + * short val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 256-bit load using cache-volatile modifier + * double4 *d_in; + * double4 val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 96-bit load using cache-streaming modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); + * \endcode + * + * \tparam MODIFIER [inferred] CacheLoadModifier enumeration + * \tparam InputIteratorT [inferred] Input iterator type \iterator + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated load iteration (inductive case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const *ptr, T *vals) + { + vals[COUNT] = ThreadLoad(ptr + COUNT); + IterateThreadLoad::template Load(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) + { + vals[COUNT] = itr[COUNT]; + IterateThreadLoad::Dereference(itr, vals); + } +}; + + +/// Helper structure for templated load iteration (termination case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ + { \ + uint4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ + "=r"(retval.x), \ + "=r"(retval.y), \ + "=r"(retval.z), \ + "=r"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ + { \ + ulonglong2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ + "=l"(retval.x), \ + "=l"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ + { \ + ushort4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ + "=h"(retval.x), \ + "=h"(retval.y), \ + "=h"(retval.z), \ + "=h"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ + { \ + uint2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ + { \ + unsigned long long retval; \ + asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint (4B) ThreadLoad 
specialization for the given Cache load modifier + */ +#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ + { \ + unsigned int retval; \ + asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " ld."#ptx_modifier".u8 datum, [%1];" \ + " cvt.u16.u8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (unsigned char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given Cache load modifier + */ +#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_LOAD_ALL(LOAD_CA, ca) + _CUB_LOAD_ALL(LOAD_CG, cg) + _CUB_LOAD_ALL(LOAD_CS, cs) + _CUB_LOAD_ALL(LOAD_CV, cv) +#else + _CUB_LOAD_ALL(LOAD_CA, global) + // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 + _CUB_LOAD_ALL(LOAD_CG, volatile.global) + _CUB_LOAD_ALL(LOAD_CS, global) + _CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif + +#if CUB_PTX_ARCH >= 350 + _CUB_LOAD_ALL(LOAD_LDG, global.nc) +#else + _CUB_LOAD_ALL(LOAD_LDG, global) +#endif + + +// Macro cleanup +#undef _CUB_LOAD_ALL +#undef _CUB_LOAD_1 +#undef _CUB_LOAD_2 +#undef _CUB_LOAD_4 +#undef _CUB_LOAD_8 +#undef _CUB_LOAD_16 + + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( + InputIteratorT itr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *itr; +} + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *ptr; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + T retval = *reinterpret_cast(ptr); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); +/* + VolatileWord 
words[VOLATILE_MULTIPLE]; + + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + + return *reinterpret_cast(words); +*/ + + T retval; + VolatileWord *words = reinterpret_cast(&retval); + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Apply tags for partial-specialization + return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadLoad definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T const *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + typedef typename UnitWord::DeviceWord DeviceWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( + reinterpret_cast(const_cast(ptr)), + words); + + return *reinterpret_cast(words); +} + + +/** + * ThreadLoad definition for generic modifiers + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) +{ + // Apply tags for partial-specialization + return ThreadLoad( + itr, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/thread/thread_operators.cuh b/cpp/nvgraph/external/cub_semiring/thread/thread_operators.cuh new file mode 100644 index 00000000000..2bd5403e864 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/thread/thread_operators.cuh @@ -0,0 +1,317 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple binary operator functor types + */ + +/****************************************************************************** + * Simple functor operators + ******************************************************************************/ + +#pragma once + +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \brief Default equality functor + */ +struct Equality +{ + /// Boolean equality operator, returns (a == b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a == b; + } +}; + + +/** + * \brief Default inequality functor + */ +struct Inequality +{ + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a != b; + } +}; + + +/** + * \brief Inequality functor (wraps equality functor) + */ +template +struct InequalityWrapper +{ + /// Wrapped equality operator + EqualityOp op; + + /// Constructor + __host__ __device__ __forceinline__ + InequalityWrapper(EqualityOp op) : op(op) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) + { + return !op(a, b); + } +}; + + +/** + * \brief Default sum functor + */ +struct Sum +{ + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return a + b; + } +}; + + +/** + * \brief Default max functor + */ +struct Max +{ + /// Boolean max operator, returns (a > b) ? a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MAX(a, b); + } +}; + + +/** + * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) + */ +struct ArgMax +{ + /// Boolean max operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default min functor + */ +struct Min +{ + /// Boolean min operator, returns (a < b) ? 
a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MIN(a, b); + } +}; + + +/** + * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) + */ +struct ArgMin +{ + /// Boolean min operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default cast functor + */ +template +struct CastOp +{ + /// Cast operator, returns (B) a + template + __host__ __device__ __forceinline__ B operator()(const A &a) const + { + return (B) a; + } +}; + + +/** + * \brief Binary operator wrapper for switching non-commutative scan arguments + */ +template +class SwizzleScanOp +{ +private: + + /// Wrapped scan operator + ScanOp scan_op; + +public: + + /// Constructor + __host__ __device__ __forceinline__ + SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} + + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ + T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); + + return scan_op(_b, _a); + } +}; + + +/** + * \brief Reduce-by-segment functor. + * + * Given two cub::KeyValuePair inputs \p a and \p b and a + * binary associative combining operator \p f(const T &x, const T &y), + * an instance of this functor returns a cub::KeyValuePair whose \p key + * field is a.key + b.key, and whose \p value field + * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * + * ReduceBySegmentOp is an associative, non-commutative binary combining operator + * for input sequences of cub::KeyValuePair pairings. Such + * sequences are typically used to represent a segmented set of values to be reduced + * and a corresponding set of {0,1}-valued integer "head flags" demarcating the + * first value of each segment. + * + */ +template ///< Binary reduction operator to apply to values +struct ReduceBySegmentOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval; + retval.key = first.key + second.key; + retval.value = (second.key) ? 
+ second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate + op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + return retval; + } +}; + + + +template ///< Binary reduction operator to apply to values +struct ReduceByKeyOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval = second; + + if (first.key == second.key) + retval.value = op(first.value, retval.value); + + return retval; + } +}; + + + + + + + +/** @} */ // end group UtilModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/thread/thread_reduce.cuh b/cpp/nvgraph/external/cub_semiring/thread/thread_reduce.cuh new file mode 100644 index 00000000000..9e277050236 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/thread/thread_reduce.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential reduction over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + +/** + * Sequential reduction over statically-sized array types + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix, ///< [in] Prefix to seed reduction with + Int2Type /*length*/) +{ + T retval = prefix; + + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + retval = reduction_op(retval, input[i]); + + return retval; +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. 
+ * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce((T*) input, reduction_op); +} + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/thread/thread_scan.cuh b/cpp/nvgraph/external/cub_semiring/thread/thread_scan.cuh new file mode 100644 index 00000000000..545b4141918 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/thread/thread_scan.cuh @@ -0,0 +1,268 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } + + return inclusive; +} + + + +/** + * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
+{ + return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + + + + + + + + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(inclusive, input[i]); + output[i] = inclusive; + } + + return inclusive; +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + T inclusive = input[0]; + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. 
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group UtilModule + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/thread/thread_search.cuh b/cpp/nvgraph/external/cub_semiring/thread/thread_search.cuh new file mode 100644 index 00000000000..379a08a51e7 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/thread/thread_search.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
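To make the exclusive/inclusive distinction in `thread_scan.cuh` concrete, the following host-side sketch mirrors the seeded scan semantics of the functions above (illustrative only; the device versions unroll the loop and support output aliasing):

```cpp
#include <cstdio>

// Host-side analogue of the thread_scan.cuh helpers: sequential inclusive and
// exclusive prefix scans over a statically-sized array, seeded with a prefix.
template <int LENGTH, typename T, typename ScanOp>
T ThreadScanInclusiveAnalogue(const T (&in)[LENGTH], T (&out)[LENGTH], ScanOp op, T prefix)
{
    T inclusive = op(prefix, in[0]);
    out[0] = inclusive;
    for (int i = 1; i < LENGTH; ++i)
    {
        inclusive = op(inclusive, in[i]);
        out[i] = inclusive;
    }
    return inclusive;                      // thread-local aggregate
}

template <int LENGTH, typename T, typename ScanOp>
T ThreadScanExclusiveAnalogue(const T (&in)[LENGTH], T (&out)[LENGTH], ScanOp op, T prefix)
{
    T exclusive = prefix;
    T inclusive = prefix;
    for (int i = 0; i < LENGTH; ++i)
    {
        inclusive = op(exclusive, in[i]);
        out[i]    = exclusive;             // each output is the running total *before* its input
        exclusive = inclusive;
    }
    return inclusive;
}

int main()
{
    int in[4] = {1, 2, 3, 4}, inc[4], exc[4];
    auto add = [](int a, int b) { return a + b; };
    ThreadScanInclusiveAnalogue(in, inc, add, 0);   // inc = {1, 3, 6, 10}
    ThreadScanExclusiveAnalogue(in, exc, add, 0);   // exc = {0, 1, 3, 6}
    for (int v : inc) std::printf("%d ", v);
    std::printf("\n");
    for (int v : exc) std::printf("%d ", v);
    std::printf("\n");
    return 0;
}
```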
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential search + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Computes the begin offsets into A and B for the specific diagonal + */ +template < + typename AIteratorT, + typename BIteratorT, + typename OffsetT, + typename CoordinateT> +__host__ __device__ __forceinline__ void MergePathSearch( + OffsetT diagonal, + AIteratorT a, + BIteratorT b, + OffsetT a_len, + OffsetT b_len, + CoordinateT& path_coordinate) +{ + /// The value type of the input iterator + typedef typename std::iterator_traits::value_type T; + + OffsetT split_min = CUB_MAX(diagonal - b_len, 0); + OffsetT split_max = CUB_MIN(diagonal, a_len); + + while (split_min < split_max) + { + OffsetT split_pivot = (split_min + split_max) >> 1; + if (a[split_pivot] <= b[diagonal - split_pivot - 1]) + { + // Move candidate split range up A, down B + split_min = split_pivot + 1; + } + else + { + // Move candidate split range up B, down A + split_max = split_pivot; + } + } + + path_coordinate.x = CUB_MIN(split_min, a_len); + path_coordinate.y = diagonal - split_min; +} + + + +/** + * \brief Returns the offset of the first value within \p input which does not compare less than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT LowerBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (input[retval + half] < val) + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + else + { + num_items = half; + } + } + + return retval; +} + + +/** + * \brief Returns the offset of the first value within \p input which compares greater than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT UpperBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (val < input[retval + half]) + { + num_items = half; + } + else + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + } + + return retval; +} + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/thread/thread_store.cuh b/cpp/nvgraph/external/cub_semiring/thread/thread_store.cuh new file mode 100644 index 00000000000..14ee84d9270 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/thread/thread_store.cuh @@ -0,0 +1,422 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
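The searches in `thread_search.cuh` above are the building blocks nvGraph's SpMV-style kernels use to partition sorted offset arrays across threads. As a quick reference, here is a host-side analogue of the `LowerBound` routine with a small worked example (illustrative only; the CSR row-offset array shown is hypothetical):

```cpp
#include <cstdio>

// Host-side analogue of LowerBound above: offset of the first element of a
// sorted sequence that does not compare less than val (same contract as
// std::lower_bound, but returning an offset instead of an iterator).
template <typename InputIt, typename OffsetT, typename T>
OffsetT LowerBoundAnalogue(InputIt input, OffsetT num_items, T val)
{
    OffsetT retval = 0;
    while (num_items > 0)
    {
        OffsetT half = num_items >> 1;
        if (input[retval + half] < val)
        {
            retval    = retval + (half + 1);   // discard the left half and the pivot
            num_items = num_items - (half + 1);
        }
        else
        {
            num_items = half;                  // keep searching the left half
        }
    }
    return retval;
}

int main()
{
    int row_offsets[] = {0, 2, 2, 5, 9};       // e.g. a CSR row-offset array
    std::printf("%d\n", LowerBoundAnalogue(row_offsets, 5, 5));  // prints 3
    return 0;
}
```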
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory store operations. + */ +enum CacheStoreModifier +{ + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit store using cache-global modifier: + * int *d_out; + * int val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 16-bit store using default modifier + * short *d_out; + * short val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 256-bit store using write-through modifier + * double4 *d_out; + * double4 val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 96-bit store using cache-streaming cache modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * \endcode + * + * \tparam MODIFIER [inferred] CacheStoreModifier enumeration + * \tparam InputIteratorT [inferred] Output iterator type \iterator + * \tparam T [inferred] Data type of output value + */ +template < + CacheStoreModifier MODIFIER, + typename OutputIteratorT, + typename T> +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated store iteration (inductive case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T *ptr, T *vals) + { + ThreadStore(ptr + COUNT, vals[COUNT]); + IterateThreadStore::template Store(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) + { + ptr[COUNT] = vals[COUNT]; + IterateThreadStore::Dereference(ptr, vals); + } + +}; + +/// Helper structure for templated store iteration (termination case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ + { \ + asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + 
__device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ + { \ + asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ + { \ + asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " cvt.u8.u16 datum, %1;" \ + " st."#ptx_modifier".u8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"((unsigned short) val)); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given Cache load modifier + */ +#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + _CUB_STORE_16(cub_modifier, ptx_modifier) \ + _CUB_STORE_8(cub_modifier, ptx_modifier) \ + _CUB_STORE_4(cub_modifier, ptx_modifier) \ + _CUB_STORE_2(cub_modifier, ptx_modifier) \ + _CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_STORE_ALL(STORE_WB, wb) + _CUB_STORE_ALL(STORE_CG, cg) + _CUB_STORE_ALL(STORE_CS, cs) + _CUB_STORE_ALL(STORE_WT, wt) +#else + _CUB_STORE_ALL(STORE_WB, global) + _CUB_STORE_ALL(STORE_CG, global) + _CUB_STORE_ALL(STORE_CS, global) + _CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + +// Macro cleanup +#undef _CUB_STORE_ALL +#undef _CUB_STORE_1 +#undef _CUB_STORE_2 +#undef _CUB_STORE_4 +#undef _CUB_STORE_8 +#undef _CUB_STORE_16 + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ void ThreadStore( + OutputIteratorT itr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *itr = val; +} + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *ptr = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + *reinterpret_cast(ptr) = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + // Create a temporary using shuffle-words, then store using volatile-words + typedef typename UnitWord::VolatileWord VolatileWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + VolatileWord words[VOLATILE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on 
pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadStore definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Create a temporary using shuffle-words, then store using device-words + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for generic modifiers + */ +template +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) +{ + ThreadStore( + itr, + val, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/util_allocator.cuh b/cpp/nvgraph/external/cub_semiring/util_allocator.cuh new file mode 100644 index 00000000000..24c7a79fee5 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_allocator.cuh @@ -0,0 +1,708 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. 
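As a usage sketch for the `ThreadStore` helper declared in `thread_store.cuh` above: the cache modifier is supplied as a template argument and the call otherwise looks like a plain assignment. This is illustrative CUDA code, not part of the vendored file; it assumes `cpp/nvgraph/external/` is on the include path and that `scale_kernel` is a hypothetical caller.

```cpp
// CUDA sketch of a cache-modified store using the ThreadStore helper above.
#include <cuda_runtime.h>
#include "cub_semiring/thread/thread_store.cuh"

__global__ void scale_kernel(const int* d_in, int* d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        int val = d_in[i] * 2;
        // Cache at the global level; per the specializations above this maps
        // to a "st.cg" PTX instruction on sm_20 and newer.
        cub::ThreadStore<cub::STORE_CG>(d_out + i, val);
    }
}
```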
The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" +#include "util_debug.cuh" + +#include +#include + +#include "host/mutex.cuh" +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
+ * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingDeviceAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr, int device) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor(int device) : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + typedef std::map GpuCachedBytes; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round 
up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
+ * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + + return cudaSuccess; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->device == device) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + if ((active_stream == block_itr->associated_stream) || + (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) + { + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + + if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Set runtime's current device to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + // Attempt to allocate + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + device, (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. 
+ if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + return error; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree( + int device, + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + } + + // Unlock + mutex.Unlock(); + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFree(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
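A typical host-side use of the `DeviceAllocate` / `DeviceFree` pair above looks like the sketch below. It is illustrative only (error handling is minimal, the include path assumes `cpp/nvgraph/external/` is visible, and `run` is a hypothetical caller); the bin arithmetic in the comments follows the default constructor's geometry (`bin_growth = 8`, `min_bin = 3`, `max_bin = 7`).

```cpp
// Host-side usage sketch for the CachingDeviceAllocator above. With the
// default bin geometry, a 1000-byte request is rounded up to the 4KB bin
// (8^4 bytes), cached on free, and handed back on the next request that
// rounds to the same bin on the same stream.
#include <cuda_runtime.h>
#include "cub_semiring/util_allocator.cuh"

cudaError_t run(cudaStream_t stream)
{
    cub::CachingDeviceAllocator allocator;   // defaults: ~6MB cache ceiling per device
    void* d_buf = nullptr;
    cudaError_t error;

    // First allocation: nothing is cached yet, so cudaMalloc is called.
    if ((error = allocator.DeviceAllocate(&d_buf, 1000, stream))) return error;

    // ... launch kernels that use d_buf on `stream` ...

    // Returns the 4KB block to the bin cache (it is not cudaFree'd).
    if ((error = allocator.DeviceFree(d_buf))) return error;

    // Rounds to the same 4KB bin, so the cached block is reused.
    if ((error = allocator.DeviceAllocate(&d_buf, 2000, stream))) return error;
    if ((error = allocator.DeviceFree(d_buf))) return error;

    return cudaSuccess;
}
```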
+ */ + cudaError_t DeviceFree( + void* d_ptr) + { + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); + } + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free device memory + if (CubDebug(error = cudaFree(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[current_device].free -= begin->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + virtual ~CachingDeviceAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/util_arch.cuh b/cpp/nvgraph/external/cub_semiring/util_arch.cuh new file mode 100644 index 00000000000..5ec36e5f1f7 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_arch.cuh @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Static architectural properties by SM version. + */ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) + #define CUB_USE_COOPERATIVE_GROUPS +#endif + +/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). +#ifndef CUB_PTX_ARCH + #ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 + #else + #define CUB_PTX_ARCH __CUDA_ARCH__ + #endif +#endif + + +/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. +#ifndef CUB_RUNTIME_FUNCTION + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #define CUB_RUNTIME_ENABLED + #define CUB_RUNTIME_FUNCTION __host__ __device__ + #else + #define CUB_RUNTIME_FUNCTION __host__ + #endif +#endif + + +/// Number of threads per warp +#ifndef CUB_LOG_WARP_THREADS + #define CUB_LOG_WARP_THREADS(arch) \ + (5) + #define CUB_WARP_THREADS(arch) \ + (1 << CUB_LOG_WARP_THREADS(arch)) + + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) +#endif + + +/// Number of smem banks +#ifndef CUB_LOG_SMEM_BANKS + #define CUB_LOG_SMEM_BANKS(arch) \ + ((arch >= 200) ? \ + (5) : \ + (4)) + #define CUB_SMEM_BANKS(arch) \ + (1 << CUB_LOG_SMEM_BANKS(arch)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) +#endif + + +/// Oversubscription factor +#ifndef CUB_SUBSCRIPTION_FACTOR + #define CUB_SUBSCRIPTION_FACTOR(arch) \ + ((arch >= 300) ? \ + (5) : \ + ((arch >= 200) ? \ + (3) : \ + (10))) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) +#endif + + +/// Prefer padding overhead vs X-way conflicts greater than this threshold +#ifndef CUB_PREFER_CONFLICT_OVER_PADDING + #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ + ((arch >= 300) ? \ + (1) : \ + (4)) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) +#endif + + +/// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data. Minimum of two warps. +#ifndef CUB_BLOCK_THREADS + #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_BLOCK_THREADS * 2, \ + CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4, \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) +#endif + +/// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data. 
Minimum 1 item per thread +#ifndef CUB_ITEMS_PER_THREAD + #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_ITEMS_PER_THREAD * 2, \ + CUB_MAX( \ + 1, \ + (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)))) +#endif + +/// Define both nominal threads-per-block and items-per-thread +#ifndef CUB_NOMINAL_CONFIG + #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ + CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ + CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) +#endif + + + +#endif // Do not document + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/util_debug.cuh b/cpp/nvgraph/external/cub_semiring/util_debug.cuh new file mode 100644 index 00000000000..1ad60cf2db6 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_debug.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Error and event logging routines. + * + * The following macros definitions are supported: + * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
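The `CUB_BLOCK_THREADS` / `CUB_ITEMS_PER_THREAD` macros in `util_arch.cuh` above rescale tuning policies that were tuned for 4-byte items so that wider value types keep roughly the same bytes of tile storage per block. The following host-side analogue works one case through (illustrative only; function names are hypothetical and the math mirrors the macros as written):

```cpp
#include <algorithm>
#include <cstdio>

// Host-side analogue of CUB_BLOCK_THREADS / CUB_ITEMS_PER_THREAD: rescale a
// policy tuned for 4-byte items for a type of size sizeof_t, clamping to at
// least one item per thread.
int BlockThreads(int nominal4B_threads, int sizeof_t, int warp_threads = 32)
{
    int warps        = nominal4B_threads / warp_threads;
    int scaled_warps = std::max(warps * 3 / 4, warps * 4 / sizeof_t);
    return std::min(nominal4B_threads * 2, warp_threads * scaled_warps);
}

int ItemsPerThread(int nominal4B_items, int nominal4B_threads, int sizeof_t)
{
    int block_threads = BlockThreads(nominal4B_threads, sizeof_t);
    int scaled = (nominal4B_items * nominal4B_threads * 4 / sizeof_t) / block_threads;
    return std::min(nominal4B_items * 2, std::max(1, scaled));
}

int main()
{
    // A policy tuned as 128 threads x 8 items for 4-byte keys becomes
    // 96 threads x 5 items for 8-byte keys (3840B vs. 4096B of tile storage).
    std::printf("%d threads, %d items\n",
                BlockThreads(128, 8), ItemsPerThread(8, 128, 8));
    return 0;
}
```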
+ */ + +#pragma once + +#include +#include "util_namespace.cuh" +#include "util_arch.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/// CUB error reporting macro (prints error messages to stderr) +#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) + #define CUB_STDERR +#endif + + + +/** + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * + * \return The CUDA error. + */ +__host__ __device__ __forceinline__ cudaError_t Debug( + cudaError_t error, + const char* filename, + int line) +{ + (void)filename; + (void)line; +#ifdef CUB_STDERR + if (error) + { + #if (CUB_PTX_ARCH == 0) + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #elif (CUB_PTX_ARCH >= 200) + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); + #endif + } +#endif + return error; +} + + +/** + * \brief Debug macro + */ +#ifndef CubDebug + #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) +#endif + + +/** + * \brief Debug macro with exit + */ +#ifndef CubDebugExit + #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } +#endif + + +/** + * \brief Log macro for printf statements. + */ +#if !defined(_CubLog) + #if !(defined(__clang__) && defined(__CUDA__)) + #if (CUB_PTX_ARCH == 0) + #define _CubLog(format, ...) printf(format,__VA_ARGS__); + #elif (CUB_PTX_ARCH >= 200) + #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); + #endif + #else + // XXX shameless hack for clang around variadic printf... + // Compilies w/o supplying -std=c++11 but shows warning, + // so we sielence them :) + #pragma clang diagnostic ignored "-Wc++11-extensions" + #pragma clang diagnostic ignored "-Wunnamed-type-template-args" + template + inline __host__ __device__ void va_printf(char const* format, Args const&... args) + { + #ifdef __CUDA_ARCH__ + printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); + #else + printf(format, args...); + #endif + } + #ifndef __CUDA_ARCH__ + #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); + #else + #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); + #endif + #endif +#endif + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/util_device.cuh b/cpp/nvgraph/external/cub_semiring/util_device.cuh new file mode 100644 index 00000000000..fa73dbd74f1 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_device.cuh @@ -0,0 +1,347 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
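The `CubDebug` macro defined in `util_debug.cuh` above enables the wrap-every-runtime-call idiom used throughout these headers: the underlying `cudaError_t` is passed through unchanged, and failures are reported with file/line context when `CUB_STDERR` is defined. A hedged sketch (compiled with nvcc, include path and `copy_to_device` name assumed for illustration):

```cpp
// Sketch of the CubDebug error-checking idiom.
#include <cuda_runtime.h>
#include "cub_semiring/util_debug.cuh"

cudaError_t copy_to_device(void* d_dst, const void* h_src, size_t bytes)
{
    cudaError_t error = cudaSuccess;
    do
    {
        if (CubDebug(error = cudaMemcpy(d_dst, h_src, bytes, cudaMemcpyHostToDevice))) break;
        if (CubDebug(error = cudaDeviceSynchronize())) break;
    } while (0);
    return error;   // the caller still sees the original CUDA error code
}
```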
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Properties of a given CUDA device and the corresponding PTX bundle + */ + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_namespace.cuh" +#include "util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + */ +template +__host__ __device__ __forceinline__ +cudaError_t AliasTemporaries( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation + void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed + size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +{ + const int ALIGN_BYTES = 256; + const int ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[ALLOCATIONS]; + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) + { + size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += ALIGN_BYTES - 1; + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) + { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) + { + return CubDebug(cudaErrorInvalidValue); + } + + // Alias + d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); + for (int i = 0; i < ALLOCATIONS; ++i) + { + allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; + } + + return cudaSuccess; +} + + +/** + * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device + */ +template +__global__ void EmptyKernel(void) { } + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) +{ + struct Dummy + { + /// Type definition of the EmptyKernel kernel entry point + typedef void (*EmptyKernelPtr)(); + + /// Force EmptyKernel to be generated if this class is used + CUB_RUNTIME_FUNCTION __forceinline__ + EmptyKernelPtr Empty() + { + return EmptyKernel; + } + }; + + +#ifndef CUB_RUNTIME_ENABLED + (void)ptx_version; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#elif (CUB_PTX_ARCH > 0) + + ptx_version = CUB_PTX_ARCH; + return cudaSuccess; + +#else + + cudaError_t error = cudaSuccess; + do + { + cudaFuncAttributes empty_kernel_attrs; + if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + } + while (0); + + return error; + +#endif +} + + +/** + * \brief Retrieves the SM version (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)sm_version; + (void)device_ordinal; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#else + + cudaError_t error = cudaSuccess; + do + { + // Fill in SM version + int major, minor; + if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; + if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; + sm_version = major * 100 + minor * 10; + } + while (0); + + return error; + +#endif +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Synchronize the stream if specified + */ +CUB_RUNTIME_FUNCTION __forceinline__ +static cudaError_t SyncStream(cudaStream_t stream) +{ +#if (CUB_PTX_ARCH == 0) + return cudaStreamSynchronize(stream); +#else + (void)stream; + // Device can't yet sync on a specific 
stream + return cudaDeviceSynchronize(); +#endif +} + + +/** + * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. + * + * \par Snippet + * The code snippet below illustrates the use of the MaxSmOccupancy function. + * \par + * \code + * #include // or equivalently + * + * template + * __global__ void ExampleKernel() + * { + * // Allocate shared memory for BlockScan + * __shared__ volatile T buffer[4096]; + * + * ... + * } + * + * ... + * + * // Determine SM occupancy for ExampleKernel specialized for unsigned char + * int max_sm_occupancy; + * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); + * + * // max_sm_occupancy <-- 4 on SM10 + * // max_sm_occupancy <-- 8 on SM20 + * // max_sm_occupancy <-- 12 on SM35 + * + * \endcode + * + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t MaxSmOccupancy( + int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy + int block_threads, ///< [in] Number of threads per thread block + int dynamic_smem_bytes = 0) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)dynamic_smem_bytes; + (void)block_threads; + (void)kernel_ptr; + (void)max_sm_occupancy; + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + +#else + + return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( + &max_sm_occupancy, + kernel_ptr, + block_threads, + dynamic_smem_bytes); + +#endif // CUB_RUNTIME_ENABLED +} + + +/****************************************************************************** + * Policy management + ******************************************************************************/ + +/** + * Kernel dispatch configuration + */ +struct KernelConfig +{ + int block_threads; + int items_per_thread; + int tile_size; + int sm_occupancy; + + CUB_RUNTIME_FUNCTION __forceinline__ + KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init(KernelPtrT kernel_ptr) + { + block_threads = AgentPolicyT::BLOCK_THREADS; + items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); + return retval; + } +}; + + + +/// Helper for dispatching into a policy chain +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int ptx_version, FunctorT &op) + { + if (ptx_version < PTX_VERSION) { + return PrevPolicyT::Invoke(ptx_version, op); + } + return op.template Invoke(); + } +}; + +/// Helper for dispatching into a policy chain (end-of-chain specialization) +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef PolicyT ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { + return op.template Invoke(); + } +}; + + + + +#endif // Do not 
document + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/util_macro.cuh b/cpp/nvgraph/external/cub_semiring/util_macro.cuh new file mode 100644 index 00000000000..73c29d22c5c --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_macro.cuh @@ -0,0 +1,103 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Common C/C++ macro utilities + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +#ifndef CUB_ALIGN + #if defined(_WIN32) || defined(_WIN64) + /// Align struct + #define CUB_ALIGN(bytes) __declspec(align(32)) + #else + /// Align struct + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) + #endif +#endif + +#ifndef CUB_MAX + /// Select maximum(a, b) + #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) +#endif + +#ifndef CUB_MIN + /// Select minimum(a, b) + #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) +#endif + +#ifndef CUB_QUOTIENT_FLOOR + /// Quotient of x/y rounded down to nearest integer + #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) +#endif + +#ifndef CUB_QUOTIENT_CEILING + /// Quotient of x/y rounded up to nearest integer + #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) +#endif + +#ifndef CUB_ROUND_UP_NEAREST + /// x rounded up to the nearest multiple of y + #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) +#endif + +#ifndef CUB_ROUND_DOWN_NEAREST + /// x rounded down to the nearest multiple of y + #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) +#endif + + +#ifndef CUB_STATIC_ASSERT + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) + #endif // DOXYGEN_SHOULD_SKIP_THIS + + /// Static assert + #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] +#endif + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/util_namespace.cuh b/cpp/nvgraph/external/cub_semiring/util_namespace.cuh new file mode 100644 index 00000000000..9f488e96978 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_namespace.cuh @@ -0,0 +1,46 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Place-holder for prefixing the cub namespace + */ + +#pragma once + +// Wrap this vendored copy in a dedicated outer namespace so it does not clash with a stock CUB installation: +#define CUB_NS_PREFIX namespace cub_semiring { +#define CUB_NS_POSTFIX } + +#ifndef CUB_NS_PREFIX +#define CUB_NS_PREFIX +#endif + +#ifndef CUB_NS_POSTFIX +#define CUB_NS_POSTFIX +#endif diff --git a/cpp/nvgraph/external/cub_semiring/util_ptx.cuh b/cpp/nvgraph/external/cub_semiring/util_ptx.cuh new file mode 100644 index 00000000000..fae6e4fae2e --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_ptx.cuh @@ -0,0 +1,729 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + ******************************************************************************/ + +/** + * \file + * PTX intrinsics + */ + + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" +#include "util_debug.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilPtx + * @{ + */ + + +/****************************************************************************** + * PTX helper macros + ******************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Register modifier for pointer-types (for inlining PTX assembly) + */ +#if defined(_WIN64) || defined(__LP64__) + #define __CUB_LP64__ 1 + // 64-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "l" + #define _CUB_ASM_PTR_SIZE_ "u64" +#else + #define __CUB_LP64__ 0 + // 32-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "r" + #define _CUB_ASM_PTR_SIZE_ "u32" +#endif + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Inlined PTX intrinsics + ******************************************************************************/ + +/** + * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHR_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x >> shift) + addend; +#endif + return ret; +} + + +/** + * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHL_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x << shift) + addend; +#endif + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Bitfield-extract. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type /*byte_len*/) +{ + unsigned int bits; +#if CUB_PTX_ARCH >= 200 + asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); +#else + const unsigned int MASK = (1 << num_bits) - 1; + bits = (source >> bit_start) & MASK; +#endif + return bits; +} + + +/** + * Bitfield-extract for 64-bit types. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type<8> /*byte_len*/) +{ + const unsigned long long MASK = (1ull << num_bits) - 1; + return (source >> bit_start) & MASK; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits) +{ + return BFE(source, bit_start, num_bits, Int2Type()); +} + + +/** + * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. 
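As a quick illustration of what the bitfield helpers compute, the shift-and-mask fallback path of BFE is equivalent to the host-side sketch below (illustrative only; `bfe_reference` and its test values are not part of the vendored source, and `num_bits` is assumed to be less than 32):

```cuda
#include <cassert>
#include <cstdint>

// Reference semantics of the 32-bit BFE fallback: extract num_bits bits of
// source starting at bit-offset bit_start (assumes num_bits < 32).
static unsigned int bfe_reference(uint32_t source, unsigned int bit_start, unsigned int num_bits)
{
    const unsigned int mask = (1u << num_bits) - 1u;
    return (source >> bit_start) & mask;
}

int main()
{
    assert(bfe_reference(0xABCD1234u, 8, 8) == 0x12u);  // bits [8,16) of 0xABCD1234
    assert(bfe_reference(0xABCD1234u, 0, 4) == 0x4u);   // low nibble
    return 0;
}
```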
+ */ +__device__ __forceinline__ void BFI( + unsigned int &ret, + unsigned int x, + unsigned int y, + unsigned int bit_start, + unsigned int num_bits) +{ +#if CUB_PTX_ARCH >= 200 + asm ("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); +#else + x <<= bit_start; + unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; + unsigned int MASK_Y = ~MASK_X; + ret = (y & MASK_Y) | (x & MASK_X); +#endif +} + + +/** + * \brief Three-operand add. Returns \p x + \p y + \p z. + */ +__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) +{ +#if CUB_PTX_ARCH >= 200 + asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); +#else + x = x + y + z; +#endif + return x; +} + + +/** + * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. + * + * \par + * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: + * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes + * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within + * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} + * + * \par Snippet + * The code snippet below illustrates byte-permute. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * int a = 0x03020100; + * int b = 0x07060504; + * int index = 0x00007531; + * + * int selected = PRMT(a, b, index); // 0x07050301 + * + * \endcode + * + */ +__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) +{ + int ret; + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Sync-threads barrier. 
+ */ +__device__ __forceinline__ void BAR(int count) +{ + asm volatile("bar.sync 1, %0;" : : "r"(count)); +} + +/** + * CTA barrier + */ +__device__ __forceinline__ void CTA_SYNC() +{ + __syncthreads(); +} + + +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_AND(int p) +{ + return __syncthreads_and(p); +} + + +/** + * Warp barrier + */ +__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + __syncwarp(member_mask); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __any_sync(member_mask, predicate); +#else + return ::__any(predicate); +#endif +} + + +/** + * Warp all + */ +__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __all_sync(member_mask, predicate); +#else + return ::__all(predicate); +#endif +} + + +/** + * Warp ballot + */ +__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __ballot_sync(member_mask, predicate); +#else + return __ballot(predicate); +#endif +} + +/** + * Warp synchronous shfl_up + */ +__device__ __forceinline__ +unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int first_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane), "r"(member_mask)); +#else + asm volatile("shfl.up.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane)); +#endif + return word; +} + +/** + * Warp synchronous shfl_down + */ +__device__ __forceinline__ +unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int last_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile("shfl.down.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane)); +#endif + return word; +} + +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int last_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane), "r"(member_mask)); +#else + asm volatile("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane)); +#endif + return word; +} + +/** + * Floating point multiply. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FMUL_RZ(float a, float b) +{ + float d; + asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + return d; +} + + +/** + * Floating point multiply-add. (Mantissa LSB rounds towards zero.)
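The SHFL_*_SYNC wrappers above are thin layers over the CUDA warp-shuffle intrinsics; a common use of shuffle-down is a warp-wide tree reduction. A minimal sketch using the raw intrinsic directly (assumes a full 32-lane warp and the CUDA 9+ `*_sync` intrinsics; `warp_sum` is not part of the vendored source):

```cuda
// Warp-wide sum via successive shuffle-down steps; after the loop,
// lane 0 holds the total for its warp.
__device__ int warp_sum(int value)
{
    const unsigned int full_mask = 0xffffffffu;
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        value += __shfl_down_sync(full_mask, value, offset);
    }
    return value;
}
```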
+ */ +__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) +{ + float d; + asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + return d; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Terminates the calling thread + */ +__device__ __forceinline__ void ThreadExit() { + asm volatile("exit;"); +} + + +/** + * \brief Abort execution and generate an interrupt to the host CPU + */ +__device__ __forceinline__ void ThreadTrap() { + asm volatile("trap;"); +} + + +/** + * \brief Returns the row-major linear thread identifier for a multidimensional thread block + */ +__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) +{ + return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + + threadIdx.x; +} + + +/** + * \brief Returns the warp lane ID of the calling thread + */ +__device__ __forceinline__ unsigned int LaneId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); + return ret; +} + + +/** + * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. + */ +__device__ __forceinline__ unsigned int WarpId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); + return ret; +} + +/** @} */ // end group UtilPtx + + + + +/** + * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * predecessor of its predecessor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleUp(thread_data, 2, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
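The generic ShuffleUp below handles types wider than 32 bits by viewing them as an array of 32-bit ShuffleWords and shuffling each word separately. A stripped-down sketch of that idea for a double, using the raw intrinsic (illustrative only; `shuffle_up_double` is not part of the vendored source):

```cuda
#include <cstring>

// Shuffle a 64-bit value up by src_offset lanes, one 32-bit word at a time,
// mirroring the word-wise aliasing strategy ShuffleUp uses.
__device__ double shuffle_up_double(double input, unsigned int src_offset)
{
    const unsigned int full_mask = 0xffffffffu;
    unsigned int words[2];
    memcpy(words, &input, sizeof(double));   // alias the double as two 32-bit words
    words[0] = __shfl_up_sync(full_mask, words[0], src_offset);
    words[1] = __shfl_up_sync(full_mask, words[1], src_offset);
    double output;
    memcpy(&output, words, sizeof(double));
    return output;
}
```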
+ * + */ +template +__device__ __forceinline__ T ShuffleUp( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative down-offset of the peer to read from + int first_lane, ///< [in] Index of first lane in segment (typically 0) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_lane, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_lane, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * successor of its successor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleDown(thread_data, 2, 31, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. + * + */ +template +__device__ __forceinline__ T ShuffleDown( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative up-offset of the peer to read from + int last_lane, ///< [in] Index of first lane in segment (typically 31) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_lane, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_lane, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input + * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, + * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from thread 0 + * double peer_data = ShuffleIndex(thread_data, 0, 32, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. + * + */ +template +__device__ __forceinline__ T ShuffleIndex( + T input, ///< [in] The value to broadcast + int src_lane, ///< [in] Which warp lane is to do the broadcasting + int logical_warp_threads, ///< [in] Number of threads per logical warp + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], + src_lane, + logical_warp_threads - 1, + member_mask); + + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], + src_lane, + logical_warp_threads - 1, + member_mask); + + output_alias[WORD] = shuffle_word; + } + + return output; +} + + + +/** + * Compute a 32b mask of threads having the same least-significant + * LABEL_BITS of \p label as the calling thread. + */ +template +inline __device__ unsigned int MatchAny(unsigned int label) +{ + unsigned int retval; + + // Extract masks of common threads for each bit + #pragma unroll + for (int BIT = 0; BIT < LABEL_BITS; ++BIT) + { + unsigned int mask; + unsigned int current_bit = 1 << BIT; + asm ("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;" + " setp.eq.u32 p, %0, %2;\n" +#ifdef CUB_USE_COOPERATIVE_GROUPS + " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" +#else + " vote.ballot.b32 %0, p;\n" +#endif + " @!p not.b32 %0, %0;\n" + "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); + + // Remove peers who differ + retval = (BIT == 0) ? mask : retval & mask; + } + + return retval; + +// // VOLTA match +// unsigned int retval; +// asm ("{\n" +// " match.any.sync.b32 %0, %1, 0xffffffff;\n" +// "}\n" : "=r"(retval) : "r"(label)); +// return retval; + +} + + + + + + + + + + + + + + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/util_type.cuh b/cpp/nvgraph/external/cub_semiring/util_type.cuh new file mode 100644 index 00000000000..a1ea845ad04 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/util_type.cuh @@ -0,0 +1,1452 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Common type manipulation (metaprogramming) utilities + */ + +#pragma once + +#include +#include +#include + +#include "util_macro.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" + +#include "cuComplex.h" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + + +/****************************************************************************** + * Type equality + ******************************************************************************/ + +/** + * \brief Type selection (IF ? ThenType : ElseType) + */ +template +struct If +{ + /// Conditional type result + typedef ThenType Type; // true +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct If +{ + typedef ElseType Type; // false +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Conditional types + ******************************************************************************/ + +/** + * \brief Type equality test + */ +template +struct Equals +{ + enum { + VALUE = 0, + NEGATE = 1 + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Equals +{ + enum { + VALUE = 1, + NEGATE = 0 + }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Static math + ******************************************************************************/ + +/** + * \brief Statically determine log2(N), rounded up. + * + * For example: + * Log2<8>::VALUE // 3 + * Log2<3>::VALUE // 2 + */ +template +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Log2 +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Statically determine if N is a power-of-two + */ +template +struct PowerOfTwo +{ + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + + + +/****************************************************************************** + * Pointer vs. iterator detection + ******************************************************************************/ + +/** + * \brief Pointer vs. 
iterator + */ +template +struct IsPointer +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsPointer +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Qualifier detection + ******************************************************************************/ + +/** + * \brief Volatile modifier test + */ +template +struct IsVolatile +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsVolatile +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Qualifier removal + ******************************************************************************/ + +/** + * \brief Removes \p const and \p volatile qualifiers from type \p Tp. + * + * For example: + * typename RemoveQualifiers::Type // int; + */ +template +struct RemoveQualifiers +{ + /// Type without \p const and \p volatile qualifiers + typedef Up Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + + +/****************************************************************************** + * Marker types + ******************************************************************************/ + +/** + * \brief A simple "NULL" marker type + */ +struct NullType +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template + __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } + + __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } + + __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } + +#endif // DOXYGEN_SHOULD_SKIP_THIS +}; + + +/** + * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) + */ +template +struct Int2Type +{ + enum {VALUE = A}; +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/****************************************************************************** + * Size and alignment + ******************************************************************************/ + +/// Structure alignment +template +struct AlignBytes +{ + struct Pad + { + T val; + char byte; + }; + + enum + { + /// The "true CUDA" alignment of T in bytes + ALIGN_BYTES = sizeof(Pad) - sizeof(T) + }; + + /// The "truly aligned" type + typedef T Type; +}; + +// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree +// with device C++ compilers (EDG) on types passed as template parameters through +// kernel functions + +#define __CUB_ALIGN_BYTES(t, b) \ + template <> struct AlignBytes \ + { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; + +__CUB_ALIGN_BYTES(short4, 8) +__CUB_ALIGN_BYTES(ushort4, 8) +__CUB_ALIGN_BYTES(int2, 8) +__CUB_ALIGN_BYTES(uint2, 8) +__CUB_ALIGN_BYTES(long long, 8) +__CUB_ALIGN_BYTES(unsigned long long, 8) +__CUB_ALIGN_BYTES(float2, 8) +__CUB_ALIGN_BYTES(double, 8) +#ifdef _WIN32 + __CUB_ALIGN_BYTES(long2, 8) + __CUB_ALIGN_BYTES(ulong2, 8) +#else + __CUB_ALIGN_BYTES(long2, 16) + __CUB_ALIGN_BYTES(ulong2, 16) +#endif +__CUB_ALIGN_BYTES(int4, 16) +__CUB_ALIGN_BYTES(uint4, 16) 
+__CUB_ALIGN_BYTES(float4, 16) +__CUB_ALIGN_BYTES(long4, 16) +__CUB_ALIGN_BYTES(ulong4, 16) +__CUB_ALIGN_BYTES(longlong2, 16) +__CUB_ALIGN_BYTES(ulonglong2, 16) +__CUB_ALIGN_BYTES(double2, 16) +__CUB_ALIGN_BYTES(longlong4, 16) +__CUB_ALIGN_BYTES(ulonglong4, 16) +__CUB_ALIGN_BYTES(double4, 16) + +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; + + +/// Unit-words of data movement +template +struct UnitWord +{ + enum { + ALIGN_BYTES = AlignBytes::ALIGN_BYTES + }; + + template + struct IsMultiple + { + enum { + UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, + IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) + }; + }; + + /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned int, + typename If::IS_MULTIPLE, + unsigned short, + unsigned char>::Type>::Type ShuffleWord; + + /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned long long, + ShuffleWord>::Type VolatileWord; + + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + ulonglong2, + VolatileWord>::Type DeviceWord; + + /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + uint4, + typename If::IS_MULTIPLE, + uint2, + ShuffleWord>::Type>::Type TextureWord; +}; + + +// float2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint2 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef unsigned long long DeviceWord; +#endif + typedef float2 TextureWord; +}; + +// float4 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint4 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef ulonglong2 DeviceWord; +#endif + typedef float4 TextureWord; +}; + + +// char2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef unsigned short ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef unsigned short VolatileWord; + typedef short DeviceWord; +#else + typedef unsigned short VolatileWord; + typedef unsigned short DeviceWord; +#endif + typedef unsigned short TextureWord; +}; + + +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Vector type inference utilities. + ******************************************************************************/ + +/** + * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. 
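As a usage sketch of the vector-type mapping (the include path, the outer `cub_semiring` namespace, and the stock two-parameter `CubVector<T, vec_elements>` signature are assumptions based on this vendored copy):

```cuda
#include "cub_semiring/util_type.cuh"   // path assumed from this PR's layout

// Hypothetical usage: CubVector<int, 2> derives from CUDA's built-in int2
// (its nested Type), and the macro-generated operators work component-wise.
__global__ void cub_vector_demo()
{
    using Vec2 = cub_semiring::cub::CubVector<int, 2>;
    Vec2 a; a.x = 1; a.y = 2;
    Vec2 b; b.x = 3; b.y = 4;
    Vec2 c = a + b;        // c.x == 4, c.y == 6
    Vec2::Type raw = c;    // Vec2::Type is the built-in int2
    (void)raw;
}
```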
+ */ +template struct CubVector; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +enum +{ + /// The maximum number of elements in CUDA vector types + MAX_VEC_ELEMENTS = 4, +}; + + +/** + * Generic vector-1 type + */ +template +struct CubVector +{ + T x; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-2 type + */ +template +struct CubVector +{ + T x; + T y; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-3 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-4 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + T w; + + typedef T BaseType; + typedef CubVector Type; +}; + + +/** + * Macro for expanding partially-specialized built-in vector types + */ +#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ + \ + template<> struct CubVector : short_type##1 \ + { \ + typedef base_type BaseType; \ + typedef short_type##1 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##2 \ + { \ + typedef base_type BaseType; \ + typedef short_type##2 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##3 \ + { \ + typedef base_type BaseType; \ + typedef short_type##3 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##4 \ + { \ + typedef base_type BaseType; \ + typedef short_type##4 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + retval.w = w + other.w; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + retval.w = w - other.w; \ + return retval; \ + } \ + }; + + + +// Expand CUDA vector types for built-in primitives +CUB_DEFINE_VECTOR_TYPE(char, char) +CUB_DEFINE_VECTOR_TYPE(signed char, char) +CUB_DEFINE_VECTOR_TYPE(short, short) +CUB_DEFINE_VECTOR_TYPE(int, int) +CUB_DEFINE_VECTOR_TYPE(long, long) +CUB_DEFINE_VECTOR_TYPE(long long, longlong) +CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) +CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) +CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) +CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) 
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) +CUB_DEFINE_VECTOR_TYPE(float, float) +CUB_DEFINE_VECTOR_TYPE(double, double) +CUB_DEFINE_VECTOR_TYPE(bool, uchar) + +// Undefine macros +#undef CUB_DEFINE_VECTOR_TYPE + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Wrapper types + ******************************************************************************/ + +/** + * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions + */ +template +struct Uninitialized +{ + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + WORDS = sizeof(T) / sizeof(DeviceWord) + }; + + /// Backing storage + DeviceWord storage[WORDS]; + + /// Alias + __host__ __device__ __forceinline__ T& Alias() + { + return reinterpret_cast(*this); + } +}; + + +/** + * \brief A key identifier paired with a corresponding value + */ +template < + typename _Key, + typename _Value +#if defined(_WIN32) && !defined(_WIN64) + , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) + , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) +#endif // #if defined(_WIN32) && !defined(_WIN64) + > +struct KeyValuePair +{ + typedef _Key Key; ///< Key data type + typedef _Value Value; ///< Value data type + + Key key; ///< Item key + Value value; ///< Item value + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#if defined(_WIN32) && !defined(_WIN64) + +/** + * Win32 won't do 16B alignment. 
This can present two problems for + * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: + * 1) If a smaller-aligned item were to be listed first, the host compiler places the + * should-be-16B item at too early an offset (and disagrees with device compiler) + * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size + * of the struct wrong (and disagrees with device compiler) + * + * So we put the larger-should-be-aligned item first, and explicitly pad the + * end of the struct + */ + +/// Smaller key specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Value value; // Value has larger would-be alignment and goes first + Key key; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + + +/// Smaller value specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Key key; // Key has larger would-be alignment and goes first + Value value; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#endif // #if defined(_WIN32) && !defined(_WIN64) + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * \brief A wrapper for passing simple static arrays as kernel parameters + */ +template +struct ArrayWrapper +{ + + /// Statically-sized array of type \p T + T array[COUNT]; + + /// Constructor + __host__ __device__ __forceinline__ ArrayWrapper() {} +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. + * + * Many multi-pass computations require a pair of "ping-pong" storage + * buffers (e.g., one for reading from and the other for writing to, and then + * vice-versa for the subsequent pass). This structure wraps a set of device + * buffers and a "selector" member to track which is "current". 
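A minimal host-side sketch of the ping-pong pattern this wrapper supports (the kernel launch is elided and the include path and namespace are assumptions, as above):

```cuda
#include "cub_semiring/util_type.cuh"   // path assumed from this PR's layout

// Each pass reads Current() and writes Alternate(), then flips the selector
// so the two buffers swap roles for the next pass.
void run_ping_pong_passes(int* d_in_out, int* d_scratch, int num_passes)
{
    cub_semiring::cub::DoubleBuffer<int> buffers(d_in_out, d_scratch);
    for (int pass = 0; pass < num_passes; ++pass)
    {
        // launch_my_pass(buffers.Current(), buffers.Alternate());  // hypothetical kernel launch
        buffers.selector ^= 1;
    }
    // buffers.Current() now points at the most recently written buffer.
}
```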
+ */ +template +struct DoubleBuffer +{ + /// Pair of device buffer pointers + T *d_buffers[2]; + + /// Selector into \p d_buffers (i.e., the active/valid buffer) + int selector; + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer() + { + selector = 0; + d_buffers[0] = NULL; + d_buffers[1] = NULL; + } + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer( + T *d_current, ///< The currently valid buffer + T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current + { + selector = 0; + d_buffers[0] = d_current; + d_buffers[1] = d_alternate; + } + + /// \brief Return pointer to the currently valid buffer + __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } + + /// \brief Return pointer to the currently invalid buffer + __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } + +}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + + +/** + * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name + */ +#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ + template \ + struct detector_name \ + { \ + template \ + static char& test(typename C::nested_type_name*); \ + template \ + static int& test(...); \ + enum \ + { \ + VALUE = sizeof(test(0)) < sizeof(int) \ + }; \ + }; + + + +/****************************************************************************** + * Simple enable-if (similar to Boost) + ******************************************************************************/ + +/** + * \brief Simple enable-if (similar to Boost) + */ +template +struct EnableIf +{ + /// Enable-if type for SFINAE dummy variables + typedef T Type; +}; + + +template +struct EnableIf {}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + +/** + * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) + */ +template +struct BinaryOpHasIdxParam +{ +private: +/* + template struct SFINAE1 {}; + template struct SFINAE2 {}; + template struct SFINAE3 {}; + template struct SFINAE4 {}; +*/ + template struct SFINAE5 {}; + template struct SFINAE6 {}; + template struct SFINAE7 {}; + template struct SFINAE8 {}; +/* + template static char Test(SFINAE1 *); + template static char Test(SFINAE2 *); + template static char Test(SFINAE3 *); + template static char Test(SFINAE4 *); +*/ + template static char Test(SFINAE5 *); + template static char Test(SFINAE6 *); + template static char Test(SFINAE7 *); + template static char Test(SFINAE8 *); + + template static int Test(...); + +public: + + /// Whether the functor BinaryOp has a third unsigned int index param + static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); +}; + + + + +/****************************************************************************** + * Simple type traits utilities. 
+ * + * For example: + * Traits::CATEGORY // SIGNED_INTEGER + * Traits::NULL_TYPE // true + * Traits::CATEGORY // NOT_A_NUMBER + * Traits::PRIMITIVE; // false + * + ******************************************************************************/ + +/** + * \brief Basic type traits categories + */ +enum Category +{ + NOT_A_NUMBER, + SIGNED_INTEGER, + UNSIGNED_INTEGER, + FLOATING_POINT +}; + + +/** + * \brief Basic type traits + */ +template +struct BaseTraits +{ + /// Category + static const Category CATEGORY = _CATEGORY; + enum + { + PRIMITIVE = _PRIMITIVE, + NULL_TYPE = _NULL_TYPE, + }; +}; + + +/** + * Basic type traits (unsigned primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = UNSIGNED_INTEGER; + static const UnsignedBits LOWEST_KEY = UnsignedBits(0); + static const UnsignedBits MAX_KEY = UnsignedBits(-1); + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key; + } + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key; + } + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + + +/** + * Basic type traits (signed primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = SIGNED_INTEGER; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits LOWEST_KEY = HIGH_BIT; + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + +template +struct FpLimits; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ float Max() { + return FLT_MAX; + } + + static __host__ __device__ __forceinline__ float Lowest() { + return FLT_MAX * float(-1); + } +}; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ double Max() { + return DBL_MAX; + } + + static __host__ __device__ __forceinline__ double Lowest() { + return DBL_MAX * double(-1); + } +}; + +template +struct TypeConst; + +template <> +struct TypeConst +{ + static __host__ __device__ __forceinline__ cuComplex Zero() + { + return make_cuComplex(0.f, 0.f); + } + static __host__ __device__ __forceinline__ cuComplex One() + { + return make_cuComplex(1.f, 0.f); + } +}; + +template <> +struct TypeConst +{ + static __host__ __device__ __forceinline__ cuDoubleComplex Zero() + { + return make_cuDoubleComplex(0.f, 0.f); + } + static __host__ __device__ __forceinline__ cuDoubleComplex One() + { + return make_cuDoubleComplex(1.f, 0.f); + } +}; + +template +struct TypeConst +{ + static __host__ __device__ __forceinline__ _T Zero() + { + return _T(0); + } + static 
__host__ __device__ __forceinline__ _T One() + { + return _T(1); + } +}; + + +/** + * Basic type traits (fp primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = FLOATING_POINT; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; + return key ^ mask; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); + return key ^ mask; + }; + + static __host__ __device__ __forceinline__ T Max() { + return FpLimits::Max(); + } + + static __host__ __device__ __forceinline__ T Lowest() { + return FpLimits::Lowest(); + } +}; + +/** + * Basic type traits (fp complex primitive specialization) + */ +template +struct BaseTraits +{ + typedef Unused UnsignedBits; + + static const Category CATEGORY = FLOATING_POINT; + + enum + { + PRIMITIVE = false, + NULL_TYPE = false, + }; +}; + + +/** + * \brief Numeric type traits + */ +template struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; + + + +/** + * \brief Type traits + */ +template +struct Traits : NumericTraits::Type> {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Semiring util + */ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +//@TODO: reuse cub +/*template +struct type_info; + +template <> +struct type_info +{ + static __host__ __device__ __forceinline__ double inf() { return DBL_MAX;} + static __host__ __device__ __forceinline__ double ninf() { return -DBL_MAX;} + // this is what we use as a tolerance in the algorithms, more precision than this is useless for CPU reference comparison + static __host__ __device__ __forceinline__ double tol() { return 1e-6; } +}; + +template <> +struct type_info +{ + static __host__ __device__ __forceinline__ float inf() {return FLT_MAX;} + static __host__ __device__ __forceinline__ float ninf() {return -FLT_MAX;} + static __host__ __device__ __forceinline__ float tol() {return 1e-4;} +}; + + +template <> +struct type_info +{ + static __host__ __device__ __forceinline__ int inf() {return INT_MAX;} + static __host__ __device__ __forceinline__ int ninf() {return INT_MIN;} + 
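The `TwiddleIn`/`TwiddleOut` hooks above exist so radix sort can operate on unsigned bit patterns whose unsigned ordering matches the original key ordering: signed integers flip the sign bit, and floating-point keys flip either just the sign bit (positive values) or every bit (negative values). A host-only sketch of that mapping and the ordering property it guarantees; the function and variable names are local to the sketch, not part of the header.

```cpp
// Host-only check that the "twiddled" unsigned keys order the same way as the
// original signed-integer and float keys (ignoring NaNs), which is the property
// the radix-sort specializations above rely on.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t twiddle_int(int32_t v)        // SIGNED_INTEGER: flip the sign bit
{
    uint32_t bits; std::memcpy(&bits, &v, sizeof(bits));
    return bits ^ 0x80000000u;
}

static uint32_t twiddle_float(float f)        // FLOATING_POINT: sign-dependent mask
{
    uint32_t bits; std::memcpy(&bits, &f, sizeof(bits));
    uint32_t mask = (bits & 0x80000000u) ? 0xffffffffu : 0x80000000u;
    return bits ^ mask;
}

int main()
{
    assert(twiddle_int(-5) < twiddle_int(3));            // negatives sort first
    assert(twiddle_float(-2.5f) < twiddle_float(-1.0f)); // more negative sorts first
    assert(twiddle_float(-1.0f) < twiddle_float(0.5f));
    return 0;
}
```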
static __host__ __device__ __forceinline__ int tol() {return 0;} +};*/ + +template +struct PlusTimesSemiring +{ + // enable with c++11 + /* + static_assert( std::is_same::type>::value || + std::is_same::type>::value, + "Graph value type is not supported by this semiring"); + */ + + static __host__ __device__ __forceinline__ V plus_ident(){ return TypeConst::Zero();} + static __host__ __device__ __forceinline__ V times_ident(){ return TypeConst::One();} + static __host__ __device__ __forceinline__ V times_null(){ return TypeConst::Zero();} + + + static __host__ __device__ __forceinline__ V plus(const V &arg0, const V &arg1) + { + return arg0 + arg1; + } + static __host__ __device__ __forceinline__ V times(const V &arg0, const V &arg1) + { + return arg0 * arg1; + } + + // used in external algs + struct SumOp + { + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return plus(a, b); + } + }; + + enum{ + HAS_PLUS_ATOMICS = 1, // for cub fixup path deduction + }; +}; + +template +struct MinPlusSemiring +{ + // enable with c++11 + /* + static_assert( std::is_same::type>::value || + std::is_same::type>::value, + "Graph value type is not supported by this semiring"); + */ + + static __host__ __device__ __forceinline__ V plus_ident(){ return FpLimits::Max();} + static __host__ __device__ __forceinline__ V times_ident(){ return TypeConst::Zero();} + static __host__ __device__ __forceinline__ V times_null(){ return FpLimits::Max();} + + + static __host__ __device__ __forceinline__ V plus(const V &arg0, const V &arg1) + { + return CUB_MIN(arg0, arg1); + } + static __host__ __device__ __forceinline__ V times(const V &arg0, const V &arg1) + { + return arg0 + arg1; + } + + // used in external algs + struct SumOp + { + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return plus(a, b); + } + }; + + enum{ + HAS_PLUS_ATOMICS = 0, // for cub fixup path deduction + }; +}; + +template +struct MaxMinSemiring +{ + // enable with c++11 + /* + static_assert( std::is_same::type>::value || + std::is_same::type>::value, + "Graph value type is not supported by this semiring"); + */ + + static __host__ __device__ __forceinline__ V plus_ident(){ return FpLimits::Lowest();} + static __host__ __device__ __forceinline__ V times_ident(){ return FpLimits::Max();} + static __host__ __device__ __forceinline__ V times_null(){ return FpLimits::Lowest();} + + + static __host__ __device__ __forceinline__ V plus(const V &arg0, const V &arg1) + { + return CUB_MAX(arg0, arg1); + } + static __host__ __device__ __forceinline__ V times(const V &arg0, const V &arg1) + { + return CUB_MIN(arg0, arg1); + } + + // used in external algs + struct SumOp + { + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return plus(a, b); + } + }; + + enum{ + HAS_PLUS_ATOMICS = 0, // for cub fixup path deduction + }; +}; + +template +struct OrAndBoolSemiring +{ + // enable with c++11 + /* + static_assert( std::is_same::type>::value || + std::is_same::type>::value, + "Graph value type is not supported by this semiring"); + */ + + static __host__ __device__ __forceinline__ V plus_ident(){ return TypeConst::Zero();} + static __host__ __device__ __forceinline__ V times_ident(){ return TypeConst::One();} + static __host__ __device__ __forceinline__ V times_null(){ return TypeConst::Zero();} + + + 
static __host__ __device__ __forceinline__ V plus(const V &arg0, const V &arg1) + { + return (bool) arg0 | (bool) arg1; + } + static __host__ __device__ __forceinline__ V times(const V &arg0, const V &arg1) + { + return (bool) arg0 & (bool) arg1; + } + + // used in external algs + struct SumOp + { + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return plus(a, b); + } + }; + + enum{ + HAS_PLUS_ATOMICS = 0, // for cub fixup path deduction + }; +}; + +template +struct LogPlusSemiring +{ + // enable with c++11 + /* + static_assert( std::is_same::type>::value || + std::is_same::type>::value, + "Graph value type is not supported by this semiring"); + */ + + static __host__ __device__ __forceinline__ V plus_ident(){ return FpLimits::Max();} + static __host__ __device__ __forceinline__ V times_ident(){ return TypeConst::Zero();} + static __host__ __device__ __forceinline__ V times_null(){ return FpLimits::Max();} + + + static __host__ __device__ __forceinline__ V plus(const V &arg0, const V &arg1) + { + return -log(exp(-arg0) + exp(-arg1)); + } + static __host__ __device__ __forceinline__ V times(const V &arg0, const V &arg1) + { + return arg0 + arg1; + } + + // used in external algs + struct SumOp + { + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return plus(a, b); + } + }; + + enum{ + HAS_PLUS_ATOMICS = 0, // for cub fixup path deduction + }; +}; + +// used in external algs +template +struct SumOp +{ + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return SR::plus(a, b); + } +}; +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_reduce_shfl.cuh b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_reduce_shfl.cuh new file mode 100644 index 00000000000..682a5bfedc2 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_reduce_shfl.cuh @@ -0,0 +1,551 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
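The semiring structs above all expose the same `plus_ident()`/`plus()`/`times()` surface, which is what lets one templated kernel cover dot-product SpMV, shortest-path relaxation, reachability, and so on. Below is a minimal sketch of a CSR row reduction written against that surface; the `csr_*` arrays, `x`, `row`, and the function name are hypothetical and not part of nvGraph.

```cpp
// One CSR row reduced under an arbitrary semiring SR: start from the identity of
// "plus" and fold in times(A[row,col], x[col]) for every stored entry.
template <typename SR, typename V, typename I>
__host__ __device__ V semiring_row_reduce(const I *csr_offsets, const I *csr_cols,
                                          const V *csr_vals, const V *x, I row)
{
    V acc = SR::plus_ident();
    for (I nz = csr_offsets[row]; nz < csr_offsets[row + 1]; ++nz)
        acc = SR::plus(acc, SR::times(csr_vals[nz], x[csr_cols[nz]]));
    return acc;
}
```

Instantiated with `PlusTimesSemiring` this is an ordinary sparse dot product; with `MinPlusSemiring` each call performs one shortest-path style relaxation of the row; with `OrAndBoolSemiring` it answers a reachability query.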
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_ptx.cuh" +#include "../../util_type.cuh" +#include "../../util_macro.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp reduction steps + STEPS = Log2::VALUE, + + /// Number of logical warps in a PTX warp + LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, + }; + + template + struct IsInteger + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + + // Creates a mask where the last thread in each logical warp is set + template + struct LastLaneMask + { + enum { + BASE_MASK = 1 << (LOGICAL_WARP_THREADS - 1), + MASK = (LastLaneMask::MASK << LOGICAL_WARP_THREADS) | BASE_MASK, + }; + }; + + // Creates a mask where the last thread in each logical warp is set + template + struct LastLaneMask + { + enum { + MASK = 1 << (LOGICAL_WARP_THREADS - 1), + }; + }; + + + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + + unsigned int lane_id; + + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpReduceShfl( + TempStorage &/*temp_storage*/) + : + lane_id(LaneId()), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? 
+ 0 : // arch-width subwarps need not be tiled within the arch-warp + ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + //--------------------------------------------------------------------- + // Reduction steps + //--------------------------------------------------------------------- + + /// Reduction (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int ReduceStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across fp32 types) + __device__ __forceinline__ float ReduceStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long ReduceStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across long long types) + __device__ __forceinline__ long long ReduceStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across double types) + __device__ __forceinline__ double ReduceStep( + double input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane)); +#endif + + return output; + } + + + /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. 
+ SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); + + output.key = input.key; + output.value = ReduceStep( + input.value, + cub::Sum(), + last_lane, + offset, + Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key != other_key) + output.value = input.value; + + return output; + } + + + + /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } + + + /// Reduction step (generic) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T output = input; + + _T temp = ShuffleDown(output, offset, last_lane, member_mask); + + // Perform reduction op if valid + if (offset + lane_id <= last_lane) + output = reduction_op(input, temp); + + return output; + } + + + /// Reduction step (specialized for small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + /// Reduction step (specialized for types other than small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + //--------------------------------------------------------------------- + // Templated inclusive scan iteration + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ReduceStep( + T& input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + Int2Type /*step*/) + { + input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); + + ReduceStep(input, reduction_op, last_lane, Int2Type()); + } + + template + __device__ __forceinline__ void ReduceStep( + T& /*input*/, ///< [in] Calling thread's input item. 
+ ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator + int /*last_lane*/, ///< [in] Index of last lane in segment + Int2Type /*step*/) + {} + + + //--------------------------------------------------------------------- + // Reduction operations + //--------------------------------------------------------------------- + + /// Reduction + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the lane of the first and last thread in the logical warp + int first_thread = 0; + int last_thread = LOGICAL_WARP_THREADS - 1; + if (!IS_ARCH_WARP) + { + first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1)); + last_thread |= lane_id; + } + + // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32) + int lanes_with_valid_data = (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE; + + // Get the last valid lane + int last_lane = (ALL_LANES_VALID) ? + last_thread : + CUB_MIN(last_thread, first_thread + lanes_with_valid_data); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + // Convert to tail-segmented + if (HEAD_SEGMENTED) + warp_flags >>= 1; + + // Mask in the last lanes of each logical warp + warp_flags |= LastLaneMask<1, LOGICAL_WARPS>::MASK; + + // Mask out the bits below the current thread + warp_flags &= LaneMaskGe(); + + // Find the next set flag + int last_lane = __clz(__brev(warp_flags)); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_reduce_smem.cuh b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_reduce_smem.cuh new file mode 100644 index 00000000000..9ba8e94d12d --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_reduce_smem.cuh @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
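The `ReduceStep` specializations above wrap `shfl.down` in inline PTX so the shuffle's validity predicate can guard the add. The same down-shuffle reduction, written with the CUDA intrinsic `__shfl_down_sync` and a fixed step count, is sketched below for a full 32-thread warp; it is a simplified illustration, not the guarded multi-type implementation above.

```cpp
// Down-shuffle warp sum: each of the log2(32) steps pulls a value from `offset`
// lanes above and folds it in. Assumes all 32 lanes of the warp are active.
__device__ __forceinline__ float warp_sum(float value)
{
    const unsigned int full_mask = 0xffffffffu;
    #pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
        value += __shfl_down_sync(full_mask, value, offset);
    return value;                          // lane 0 holds the warp total
}
```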
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
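The shared-memory specialization that follows repeats a simple step log2(warp size) times: publish the running value, synchronize the warp, and fold in the value `OFFSET` lanes away. Below is a guarded sketch of that step loop, assuming one logical warp per physical warp and a caller-provided scratch array of `LOGICAL_WARP_THREADS` elements; the real struct below sizes its buffer at 1.5 warps-worth of elements, apparently so its unguarded fast-path loads stay in bounds.

```cpp
// Guarded sketch of the smem reduction step loop. `scratch` must hold at least
// LOGICAL_WARP_THREADS elements of shared memory; `lane` is the lane id.
template <int LOGICAL_WARP_THREADS, typename T, typename ReductionOp>
__device__ __forceinline__ T warp_reduce_smem_sketch(T input, T *scratch,
                                                     unsigned int lane,
                                                     ReductionOp reduction_op)
{
    #pragma unroll
    for (int offset = 1; offset < LOGICAL_WARP_THREADS; offset <<= 1)
    {
        scratch[lane] = input;                      // publish the running value
        __syncwarp();
        if (lane + offset < LOGICAL_WARP_THREADS)   // fold in the peer if it exists
            input = reduction_op(input, scratch[lane + offset]);
        __syncwarp();
    }
    return input;                                   // lane 0 holds the full reduction
}
```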
+ */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + + /// FlagT status (when not using ballot) + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + /// Shared memory flag type + typedef unsigned char SmemFlag; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + struct _TempStorage + { + T reduce[WARP_SMEM_ELEMENTS]; + SmemFlag flags[WARP_SMEM_ELEMENTS]; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Regular reduction + //--------------------------------------------------------------------- + + /** + * Reduction step + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp, + int STEP> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp)) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + + return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type()); + } + + + /** + * Reduction step (terminate) + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int /*folded_items_per_warp*/, ///< [in] Total number of valid items folded into each logical warp + ReductionOp /*reduction_op*/, ///< [in] Reduction operator + Int2Type /*step*/) + { + return input; + } + + + //--------------------------------------------------------------------- + // Segmented reduction + //--------------------------------------------------------------------- + + + /** + * Ballot-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. 
+ warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if (OFFSET + lane_id < next_flag) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + } + + return input; + } + + + /** + * Smem-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = temp_storage.flags; + + SmemFlag flag_status = (flag) ? SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Get peer from buffer + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + + WARP_SYNC(member_mask); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * Reduction + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< [in] Reduction operator + { + return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type<0>()); + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_scan_shfl.cuh b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_scan_shfl.cuh new file mode 100644 index 00000000000..f0deb8ddefc --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_scan_shfl.cuh @@ -0,0 +1,656 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
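Both segmented variants above locate the next segment boundary with the same ballot trick: gather every lane's flag into a 32-bit mask, drop the bits for lanes before the caller, then bit-reverse and count leading zeros to get the index of the first remaining flag. A standalone sketch of that step using the CUDA intrinsics directly is shown below; it covers only the full-warp case, whereas the code above uses its `LaneMaskGt()`/`LaneMaskGe()` helpers and also handles logical sub-warps.

```cpp
// Index of the first flagged lane at or above the caller, or 32 if none.
// __ballot_sync gathers one bit per lane; __brev + __clz return the index of the
// lowest set bit that survives the mask.
__device__ __forceinline__ int next_flagged_lane(bool flag)
{
    const unsigned int full_mask = 0xffffffffu;
    unsigned int warp_flags = __ballot_sync(full_mask, flag);
    unsigned int lane = threadIdx.x & 31u;           // lane id within the warp
    warp_flags &= ~((1u << lane) - 1u);              // keep this lane and above
    return (warp_flags == 0u) ? 32
                              : __clz(__brev(warp_flags));
}
```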
+ * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_type.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = ((0xFFFFFFFFU << STEPS) & 31) << 8, + }; + + template + struct IntegerTraits + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + /// Shared memory storage layout type + struct TempStorage {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + unsigned int lane_id; + + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpScanShfl( + TempStorage &/*temp_storage*/) + : + lane_id(LaneId()), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? + 0 : // arch-width subwarps need not be tiled within the arch-warp + ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + //--------------------------------------------------------------------- + // Inclusive scan steps + //--------------------------------------------------------------------- + + /// Inclusive prefix scan step (specialized for summation across int32 types) + __device__ __forceinline__ int InclusiveScanStep( + int input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + /// Inclusive prefix scan step (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int InclusiveScanStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp32 types) + __device__ __forceinline__ float InclusiveScanStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long InclusiveScanStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across long long types) + __device__ __forceinline__ long long InclusiveScanStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp64 types) + __device__ __forceinline__ double InclusiveScanStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + +/* + /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePairInclusiveScanStep( + KeyValuePair input, ///< [in] Calling thread's input item. + ReduceBySegmentOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } +*/ + + /// Inclusive prefix scan step (generic) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T temp = ShuffleUp(input, offset, first_lane, member_mask); + + // Perform scan op if from a valid peer + _T output = scan_op(temp, input); + if (static_cast(lane_id) < first_lane + offset) + output = input; + + return output; + } + + + /// Inclusive prefix scan step (specialized for small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. 
+ ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + //--------------------------------------------------------------------- + // Templated inclusive scan iteration + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void InclusiveScanStep( + _T& input, ///< [in] Calling thread's input item. + ScanOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + Int2Type /*step*/) ///< [in] Marker type indicating scan step + { + input = InclusiveScanStep(input, scan_op, first_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); + + InclusiveScanStep(input, scan_op, first_lane, Int2Type()); + } + + template + __device__ __forceinline__ void InclusiveScanStep( + _T& /*input*/, ///< [in] Calling thread's input item. + ScanOp /*scan_op*/, ///< [in] Binary scan operator + int /*first_lane*/, ///< [in] Index of first lane in segment + Int2Type /*step*/) ///< [in] Marker type indicating scan step + {} + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return ShuffleIndex(input, src_lane, LOGICAL_WARP_THREADS, member_mask); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + _T input, ///< [in] Calling thread's input item. + _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + // Iterate scan steps + int segment_first_lane = 0; + + // Iterate scan steps +// InclusiveScanStep(inclusive_output, scan_op, segment_first_lane, Int2Type<0>()); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output = InclusiveScanStep( + inclusive_output, + scan_op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + + } + + /// Inclusive scan, specialized for reduce-value-by-key + template + __device__ __forceinline__ void InclusiveScan( + KeyValuePair input, ///< [in] Calling thread's input item. + KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ReduceByKeyOp scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); + + unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + + // Mask away all lanes greater than ours + ballot = ballot & LaneMaskLe(); + + // Find index of first set bit + int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); + + // Iterate scan steps +// InclusiveScanStep(inclusive_output.value, scan_op.op, segment_first_lane, Int2Type<0>()); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output.value = InclusiveScanStep( + inclusive_output.value, + scan_op.op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + + unsigned int segment_id = (IS_ARCH_WARP) ? 
+ lane_id : + lane_id % LOGICAL_WARP_THREADS; + + if (segment_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); + Update(input, inclusive, exclusive, scan_op, is_integer); + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); + Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_scan_smem.cuh b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_scan_smem.cuh new file mode 100644 index 00000000000..c3a7a94ba26 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/warp/specializations/warp_scan_smem.cuh @@ -0,0 +1,397 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + }; + + /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) + typedef typename If<((Equals::VALUE || Equals::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) + template < + bool HAS_IDENTITY, + int STEP, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &partial, + ScanOp scan_op, + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share partial into buffer + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); + + WARP_SYNC(member_mask); + + // Update partial if addend is in range + if (HAS_IDENTITY || (lane_id >= OFFSET)) + { + T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); + partial = scan_op(addend, partial); + } + WARP_SYNC(member_mask); + + ScanStep(partial, scan_op, Int2Type()); + } + + + /// Basic inclusive scan iteration(template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &/*partial*/, + ScanOp /*scan_op*/, + Int2Type /*step*/) + {} + + + /// Inclusive prefix scan (specialized for summation across primitive types) + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + T identity = 0; + ThreadStore(&temp_storage[lane_id], (CellT) identity); + + WARP_SYNC(member_mask); + + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /// Inclusive prefix scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + if (lane_id == src_lane) + { + ThreadStore(temp_storage, (CellT) input); + } + + WARP_SYNC(member_mask); + + return (T)ThreadLoad(temp_storage); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, scan_op); + + // Retrieve aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT /*scan_op*/, + IsIntegerT /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + } + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T 
&exclusive, + T &warp_aggregate, + cub::Sum /*scan_o*/, + Int2Type /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + exclusive = inclusive - input; + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + // Broadcast warp aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + + // Update inclusive with initial value + inclusive = scan_op(initial_value, inclusive); + + // Get exclusive from exclusive + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); + + if (lane_id == 0) + exclusive = initial_value; + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/warp/warp_reduce.cuh b/cpp/nvgraph/external/cub_semiring/warp/warp_reduce.cuh new file mode 100644 index 00000000000..ef78dd6a009 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/warp/warp_reduce.cuh @@ -0,0 +1,612 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_reduce_shfl.cuh" +#include "specializations/warp_reduce_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) + * + * \tparam T The reduction input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpReduce} + * \par + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + * \par + * The code snippet below illustrates a single warp sum reduction within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * ... 
+ * + * // Only the first warp performs a reduction + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sum to lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. + * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + }; + +public: + + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpReduceShfl, + WarpReduceSmem >::Type InternalWarpReduce; + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + +private: + + /// Shared memory storage layout type for WarpReduce + typedef typename InternalWarpReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + +public: + + /// \smemstorage{WarpReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()) + {} + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Return the warp-wide sums to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); + } + + /** + * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).Sum( + * thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is + * undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( + * thread_data, head_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). 
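+ *
+ * \par
+ * A minimal sketch (not part of the API above) of one way the \p head_flag might be
+ * derived before calling HeadSegmentedSum, assuming a full 32-thread warp and a
+ * hypothetical per-thread \p key alongside the \p thread_data and \p temp_storage
+ * from the snippet above:
+ * \code
+ * unsigned int lane = threadIdx.x % 32;
+ *
+ * // For lane 0, __shfl_up_sync returns the caller's own key, so the explicit
+ * // (lane == 0) test is what marks the first segment head in the warp.
+ * int prev_key  = __shfl_up_sync(0xffffffff, key, 1);
+ * int head_flag = (lane == 0) || (key != prev_key);
+ *
+ * int segment_sum = WarpReduce(temp_storage).HeadSegmentedSum(thread_data, head_flag);
+ * \endcode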
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * + */ + template < + typename FlagT> + __device__ __forceinline__ T HeadSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return HeadSegmentedReduce(input, head_flag, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( + * thread_data, tail_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename FlagT> + __device__ __forceinline__ T TailSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return TailSegmentedReduce(input, tail_flag, cub::Sum()); + } + + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + /** + * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp max reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide reductions to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( + * thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, + * \p 95, and \p 127, respectively (and is undefined in other threads). 
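+ *
+ * \par
+ * A minimal sketch of a user-defined reduction functor satisfying the operator()
+ * requirement described below (the name \p CustomMin is illustrative only, not part
+ * of this header):
+ * \code
+ * struct CustomMin
+ * {
+ *     template <typename U>
+ *     __device__ __forceinline__ U operator()(const U &a, const U &b) const
+ *     {
+ *         // Return the smaller of the two operands
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Usage, given temp_storage and thread_data as in the snippet above:
+ * // int aggregate = WarpReduce(temp_storage).Reduce(thread_data, CustomMin());
+ * \endcode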
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); + } + + /** + * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).Reduce( + * thread_data, cub::Max(), valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is + * undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction operator + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( + * thread_data, head_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. 
(and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T HeadSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( + * thread_data, tail_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T TailSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); + } + + + + //@} end member group +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cub_semiring/warp/warp_scan.cuh b/cpp/nvgraph/external/cub_semiring/warp/warp_scan.cuh new file mode 100644 index 00000000000..3f78ca8a090 --- /dev/null +++ b/cpp/nvgraph/external/cub_semiring/warp/warp_scan.cuh @@ -0,0 +1,936 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_scan_shfl.cuh" +#include "specializations/warp_scan_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) + * + * \tparam T The scan input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - Supports non-commutative scan operators + * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. 
generic scan) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpScan} + * \par + * The code snippet below illustrates four concurrent warp prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, 3, ..., 31}. + * + * \par + * The code snippet below illustrates a single warp prefix sum within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for one warp + * __shared__ typename WarpScan::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a prefix sum + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + + /// Whether the data type is an integer (which has fully-associative addition) + IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) + }; + + /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpScanShfl, + WarpScanSmem >::Type InternalWarpScan; + + /// Shared memory storage layout type for WarpScan + typedef typename InternalWarpScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + unsigned int lane_id; + + + + /****************************************************************************** + * Public types + ******************************************************************************/ + +public: + + /// \smemstorage{WarpScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + InclusiveScan(input, inclusive_output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); + } + + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan( + * thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. 
+ * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + initial_value, + Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Combination (inclusive & exclusive) prefix scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. 
Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. 
+ * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Data exchange + *********************************************************************/ + //@{ + + /** + * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the warp-wide broadcasts of values from + * lanes0 in each of four warps to all other threads in those warps. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Broadcast from lane0 in each warp to all other threads in the warp + * int warp_id = threadIdx.x / 32; + * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p thread_data will be + * {0, 0, ..., 0} in warp0, + * {32, 32, ..., 32} in warp1, + * {64, 64, ..., 64} in warp2, etc. + */ + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return InternalWarpScan(temp_storage).Broadcast(input, src_lane); + } + + //@} end member group + +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/cpp/nvgraph/external/cusp b/cpp/nvgraph/external/cusp new file mode 160000 index 00000000000..a8a8df763d8 --- /dev/null +++ b/cpp/nvgraph/external/cusp @@ -0,0 +1 @@ +Subproject commit a8a8df763d8ae459188e178fd6c28567a42473c2 diff --git a/cpp/nvgraph/external/cusparse_internal.h b/cpp/nvgraph/external/cusparse_internal.h new file mode 100644 index 00000000000..8085b2abdfc --- /dev/null +++ b/cpp/nvgraph/external/cusparse_internal.h @@ -0,0 +1,3060 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if !defined(CUSPARSE_INTERNAL_H_) +#define CUSPARSE_INTERNAL_H_ + + +#ifndef CUSPARSEAPI +#ifdef _WIN32 +#define CUSPARSEAPI __stdcall +#else +#define CUSPARSEAPI +#endif +#endif + + +#define CACHE_LINE_SIZE 128 + +#define ALIGN_32(x) ((((x)+31)/32)*32) + + + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + + +struct csrilu02BatchInfo; +typedef struct csrilu02BatchInfo *csrilu02BatchInfo_t; + + +struct csrxilu0Info; +typedef struct csrxilu0Info *csrxilu0Info_t; + +struct csrxgemmSchurInfo; +typedef struct csrxgemmSchurInfo *csrxgemmSchurInfo_t; + +struct csrxtrsmInfo; +typedef struct csrxtrsmInfo *csrxtrsmInfo_t; + +struct csrilu03Info; +typedef struct csrilu03Info *csrilu03Info_t; + +struct csrmmInfo; +typedef struct csrmmInfo *csrmmInfo_t; + + +cudaStream_t cusparseGetStreamInternal(const struct cusparseContext *ctx); + + +cusparseStatus_t CUSPARSEAPI cusparseCheckBuffer( + cusparseHandle_t handle, + void *workspace); + +//------- gather: dst = src(map) --------------------- + +cusparseStatus_t CUSPARSEAPI cusparseIgather( + cusparseHandle_t handle, + int n, + const int *src, + const int *map, + int *dst); + +cusparseStatus_t CUSPARSEAPI cusparseSgather( + cusparseHandle_t handle, + int n, + const float *src, + const int *map, + float *dst); + +cusparseStatus_t CUSPARSEAPI cusparseDgather( + cusparseHandle_t handle, + int n, + const double *src, + const int *map, + double *dst); + +cusparseStatus_t CUSPARSEAPI cusparseCgather( + cusparseHandle_t handle, + int n, + const cuComplex *src, + const int *map, + cuComplex *dst); + +cusparseStatus_t CUSPARSEAPI cusparseZgather( + cusparseHandle_t handle, + int n, + const cuDoubleComplex *src, + const int *map, + cuDoubleComplex *dst); + + +//------- scatter: dst(map) = src --------------------- + +cusparseStatus_t CUSPARSEAPI cusparseIscatter( + cusparseHandle_t handle, + int n, + const int *src, + int *dst, + const int *map); + +cusparseStatus_t CUSPARSEAPI cusparseSscatter( + cusparseHandle_t handle, + int n, + const float *src, + float *dst, + const int *map); + +cusparseStatus_t CUSPARSEAPI cusparseDscatter( + cusparseHandle_t handle, + int n, + const double *src, + double *dst, + const int *map); + +cusparseStatus_t CUSPARSEAPI cusparseCscatter( + cusparseHandle_t handle, + int n, + const cuComplex *src, + cuComplex *dst, + const int *map); + +cusparseStatus_t CUSPARSEAPI cusparseZscatter( + cusparseHandle_t handle, + int n, + const cuDoubleComplex *src, + cuDoubleComplex *dst, + const int *map); + + +// x[j] = j +cusparseStatus_t CUSPARSEAPI cusparseIidentity( + cusparseHandle_t handle, + int n, + int *x); + +// x[j] = val +cusparseStatus_t CUSPARSEAPI cusparseImemset( + cusparseHandle_t handle, + int n, + int val, + int *x); + +cusparseStatus_t CUSPARSEAPI cusparseI64memset( + cusparseHandle_t handle, + size_t n, + int val, + int *x); + + +// ----------- reduce ----------------- + +/* + * cusparseStatus_t + * cusparseIreduce_bufferSize( cusparseHandle_t handle, + * int n, + * int *pBufferSizeInBytes) + * Input + * ----- + * handle handle to CUSPARSE library context. + * n number of elements. + * + * Output + * ------ + * pBufferSizeInBytes size of working space in bytes. + * + * Error Status + * ------------ + * CUSPARSE_STATUS_SUCCESS the operation completed successfully. + * CUSPARSE_STATUS_NOT_INITIALIZED the library was not initialized. + * CUSPARSE_STATUS_INVALID_VALUE n is too big or negative + * CUSPARSE_STATUS_INTERNAL_ERROR an internal operation failed. 
+ * If n is normal, we should not have this internal error. + * + * --------- + * Assumption: + * Only support n < 2^31. + * + */ +cusparseStatus_t CUSPARSEAPI cusparseIreduce_bufferSizeExt( + cusparseHandle_t handle, + int n, + size_t *pBufferSizeInBytes); + +/* + * cusparseStatus_t + * cusparseIreduce(cusparseHandle_t handle, + * int n, + * int *src, + * int *pBuffer, + * int *total_sum) + * + * total_sum = reduction(src) + * + * Input + * ------- + * handle handle to the CUSPARSE library context. + * n number of elements in src and dst. + * src array of n elements. + * pBuffer working space, the size is reported by cusparseIinclusiveScan_bufferSizeExt. + * Or it can be a NULL pointer, then CUSPARSE library allocates working space implicitly. + * + * Output + * ------- + * total_sum total_sum = reduction(src) if total_sum is not a NULL pointer. + * + * + * Error Status + * ------------ + * CUSPARSE_STATUS_SUCCESS the operation completed successfully. + * CUSPARSE_STATUS_NOT_INITIALIZED the library was not initialized. + * CUSPARSE_STATUS_ALLOC_FAILED the resources could not be allocated. + * it is possible if pBuffer is NULL. + * CUSPARSE_STATUS_INTERNAL_ERROR an internal operation failed. + * + * + */ +cusparseStatus_t CUSPARSEAPI cusparseIreduce( + cusparseHandle_t handle, + int n, + int *src, + void *pBuffer, + int *total_sum); + + + +// ----------- prefix sum ------------------- + +/* + * cusparseStatus_t + * cusparseIinclusiveScan_bufferSizeExt( cusparseHandle_t handle, + * int n, + * size_t *pBufferSizeInBytes) + * Input + * ----- + * handle handle to CUSPARSE library context. + * n number of elements. + * + * Output + * ------ + * pBufferSizeInBytes size of working space in bytes. + * + * Error Status + * ------------ + * CUSPARSE_STATUS_SUCCESS the operation completed successfully. + * CUSPARSE_STATUS_NOT_INITIALIZED the library was not initialized. + * CUSPARSE_STATUS_INVALID_VALUE n is too big or negative + * CUSPARSE_STATUS_INTERNAL_ERROR an internal operation failed. + * If n is normal, we should not have this internal error. + * + * --------- + * Assumption: + * Only support n < 2^31. + * + */ +cusparseStatus_t CUSPARSEAPI cusparseIinclusiveScan_bufferSizeExt( + cusparseHandle_t handle, + int n, + size_t *pBufferSizeInBytes); + + +/* + * cusparseStatus_t + * cusparseIinclusiveScan(cusparseHandle_t handle, + * int base, + * int n, + * int *src, + * void *pBuffer, + * int *dst, + * int *total_sum) + * + * dst = inclusiveScan(src) + base + * total_sum = reduction(src) + * + * Input + * ------- + * handle handle to the CUSPARSE library context. + * n number of elements in src and dst. + * src array of n elements. + * pBuffer working space, the size is reported by cusparseIinclusiveScan_bufferSizeExt. + * Or it can be a NULL pointer, then CUSPARSE library allocates working space implicitly. + * + * Output + * ------- + * dst array of n elements. + * dst = inclusiveScan(src) + base + * total_sum total_sum = reduction(src) if total_sum is not a NULL pointer. + * + * Error Status + * ------------ + * CUSPARSE_STATUS_SUCCESS the operation completed successfully. + * CUSPARSE_STATUS_NOT_INITIALIZED the library was not initialized. + * CUSPARSE_STATUS_ALLOC_FAILED the resources could not be allocated. + * it is possible if pBuffer is NULL. + * CUSPARSE_STATUS_INTERNAL_ERROR an internal operation failed. 
+ * + */ +cusparseStatus_t CUSPARSEAPI cusparseIinclusiveScan( + cusparseHandle_t handle, + int base, + int n, + int *src, + void *pBuffer, + int *dst, + int *total_sum); + +// ----------- stable sort ----------------- + +/* + * cusparseStatus_t + * cusparseIstableSortByKey_bufferSizeExt( cusparseHandle_t handle, + * int n, + * size_t *pBufferSizeInBytes) + * Input + * ----- + * handle handle to CUSPARSE library context. + * n number of elements. + * + * Output + * ------ + * pBufferSizeInBytes size of working space in bytes. + * + * Error Status + * ------------ + * CUSPARSE_STATUS_SUCCESS the operation completed successfully. + * CUSPARSE_STATUS_NOT_INITIALIZED the library was not initialized. + * CUSPARSE_STATUS_INVALID_VALUE n is too big or negative + * CUSPARSE_STATUS_INTERNAL_ERROR an internal operation failed. + * If n is normal, we should not have this internal error. + * + * --------- + * Assumption: + * Only support n < 2^30 because of domino scheme. + * + */ +cusparseStatus_t CUSPARSEAPI cusparseIstableSortByKey_bufferSizeExt( + cusparseHandle_t handle, + int n, + size_t *pBufferSizeInBytes); + + +/* + * cusparseStatus_t + * cusparseIstableSortByKey( cusparseHandle_t handle, + * int n, + * int *key, + * int *P) + * + * in-place radix sort. + * This is an inhouse design of thrust::stable_sort_by_key(key, P) + * + * Input + * ----- + * handle handle to CUSPARSE library context. + * n number of elements. + * key array of n elements. + * P array of n elements. + * pBuffer working space, the size is reported by cusparseIstableSortByKey_bufferSize. + * Or it can be a NULL pointer, then CUSPARSE library allocates working space implicitly. + * + * Output + * ------ + * key array of n elements. + * P array of n elements. + * + * Error Status + * ------------ + * CUSPARSE_STATUS_SUCCESS the operation completed successfully. + * CUSPARSE_STATUS_NOT_INITIALIZED the library was not initialized. + * CUSPARSE_STATUS_ALLOC_FAILED the resources could not be allocated. + * CUSPARSE_STATUS_INTERNAL_ERROR an internal operation failed. + * + * ----- + * Assumption: + * Only support n < 2^30 because of domino scheme. 
+ * + * ----- + * Usage: + * int nBufferSize = 0; + * status = cusparseIstableSortByKey_bufferSize(handle, n, &nBufferSize); + * assert(CUSPARSE_STATUS_SUCCESS == status); + * + * int *pBuffer; + * cudaStat = cudaMalloc((void**)&pBuffer, (size_t)nBufferSize); + * assert(cudaSuccess == cudaStat); + * + * d_P = 0:n-1 ; + * status = cusparseIstableSortByKey(handle, n, d_csrRowPtrA, d_P, pBuffer); + * assert(CUSPARSE_STATUS_SUCCESS == status); + * + */ +cusparseStatus_t CUSPARSEAPI cusparseIstableSortByKey( + cusparseHandle_t handle, + int n, + int *key, + int *P, + void *pBuffer); + + + +// ------------------- csr42csr ------------------ + +cusparseStatus_t CUSPARSEAPI cusparseXcsr42csr_bufferSize( + cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrEndPtrA, + size_t *pBufferSizeInByte ); + +cusparseStatus_t CUSPARSEAPI cusparseXcsr42csrRows( + cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseXcsr42csrCols( + cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrC, + const int *csrRowPtrC, + int *csrColIndC, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseScsr42csrVals( + cusparseHandle_t handle, + int m, + int n, + const float *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrC, + float *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseDcsr42csrVals( + cusparseHandle_t handle, + int m, + int n, + const double *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrC, + double *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseCcsr42csrVals( + cusparseHandle_t handle, + int m, + int n, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrC, + cuComplex *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseZcsr42csrVals( + cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrC, + cuDoubleComplex *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + void *pBuffer ); + + +// ----- csrmv_hyb ------------------------------ + +cusparseStatus_t CUSPARSEAPI cusparseScsrmv_hyb( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float *alpha, + const cusparseMatDescr_t descra, + const float *csrVal, + const int *csrRowPtr, + const int 
*csrColInd, + const float *x, + const float *beta, + float *y); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmv_hyb( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double *alpha, + const cusparseMatDescr_t descra, + const double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const double *x, + const double *beta, + double *y); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_hyb( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descra, + const cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const cuComplex *x, + const cuComplex *beta, + cuComplex *y); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_hyb( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descra, + const cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const cuDoubleComplex *x, + const cuDoubleComplex *beta, + cuDoubleComplex *y); + + +// ------------- getrf_ilu --------------------- + +cusparseStatus_t CUSPARSEAPI cusparseSgetrf_ilu( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + float *A, + const int *pattern, + const int lda, + int *d_status, + int enable_boost, + double *tol_ptr, + float *boost_ptr); + +cusparseStatus_t CUSPARSEAPI cusparseDgetrf_ilu( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + double *A, + const int *pattern, + const int lda, + int *d_status, + int enable_boost, + double *tol_ptr, + double *boost_ptr); + +cusparseStatus_t CUSPARSEAPI cusparseCgetrf_ilu( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + cuComplex *A, + const int *pattern, + const int lda, + int *d_status, + int enable_boost, + double *tol_ptr, + cuComplex *boost_ptr); + +cusparseStatus_t CUSPARSEAPI cusparseZgetrf_ilu( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + cuDoubleComplex *A, + const int *pattern, + const int lda, + int *d_status, + int enable_boost, + double *tol_ptr, + cuDoubleComplex *boost_ptr); + + +// ------------- potrf_ic --------------------- + +cusparseStatus_t CUSPARSEAPI cusparseSpotrf_ic( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + float *A, + const int *pattern, + const int lda, + int *d_status); + +cusparseStatus_t CUSPARSEAPI cusparseDpotrf_ic( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + double *A, + const int *pattern, + const int lda, + int *d_status); + +cusparseStatus_t CUSPARSEAPI cusparseCpotrf_ic( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + cuComplex *A, + const int *pattern, + const int lda, + int *d_status); + +cusparseStatus_t CUSPARSEAPI cusparseZpotrf_ic( + cusparseHandle_t handle, + const int submatrix_k, + const int n, + cuDoubleComplex *A, + const int *pattern, + const int lda, + int *d_status); + + +cusparseStatus_t CUSPARSEAPI cusparseXcsric02_denseConfig( + csric02Info_t info, + int enable_dense_block, + int max_dim_dense_block, + int threshold_dense_block, + double ratio); + +cusparseStatus_t CUSPARSEAPI cusparseXcsric02_workspaceConfig( + csric02Info_t info, + int disable_workspace_limit); + + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_denseConfig( + csrilu02Info_t info, + int enable_dense_block, + int max_dim_dense_block, + int threshold_dense_block, + double ratio); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_workspaceConfig( + csrilu02Info_t 
info, + int disable_workspace_limit); + + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02Batch_denseConfig( + csrilu02BatchInfo_t info, + int enable_dense_block, + int max_dim_dense_block, + int threshold_dense_block, + double ratio); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02Batch_workspaceConfig( + csrilu02BatchInfo_t info, + int disable_workspace_limit); + + + +// ---------------- csric02 internal ---------------- +cusparseStatus_t CUSPARSEAPI cusparseXcsric02_getLevel( + csric02Info_t info, + int **level_ref); + +cusparseStatus_t CUSPARSEAPI cusparseScsric02_internal( + cusparseHandle_t handle, + int enable_potrf, + int dense_block_start, + //int dense_block_dim, // = m - dense_block_start + int dense_block_lda, + int *level, // level is a permutation vector of 0:(m-1) + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02_internal( + cusparseHandle_t handle, + int enable_potrf, + int dense_block_start, + //int dense_block_dim, // = m - dense_block_start + int dense_block_lda, + int *level, // level is a permutation vector of 0:(m-1) + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02_internal( + cusparseHandle_t handle, + int enable_potrf, + int dense_block_start, + //int dense_block_dim, // = m - dense_block_start + int dense_block_lda, + int *level, // level is a permutation vector of 0:(m-1) + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02_internal( + cusparseHandle_t handle, + int enable_potrf, + int dense_block_start, + //int dense_block_dim, // = m - dense_block_start + int dense_block_lda, + int *level, // level is a permutation vector of 0:(m-1) + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +// csrilu02 internal + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_getLevel( + csrilu02Info_t info, + int **level_ref); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_getCsrEndPtrL( + csrilu02Info_t info, + int **csrEndPtrL_ref); + + +// ----------------- batch ilu0 ----------------- + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02BatchInfo( + csrilu02BatchInfo_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02BatchInfo( + csrilu02BatchInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02Batch_zeroPivot( + cusparseHandle_t handle, + csrilu02BatchInfo_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02Batch_numericBoost( + cusparseHandle_t handle, + csrilu02BatchInfo_t info, + int enable_boost, + double *tol, + float *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02Batch_numericBoost( + cusparseHandle_t handle, + csrilu02BatchInfo_t info, + int enable_boost, + double *tol, + double *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02Batch_numericBoost( + cusparseHandle_t handle, + csrilu02BatchInfo_t info, + int enable_boost, + double *tol, + cuComplex *numeric_boost); + 
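Most of the internal helpers declared in this header follow cuSPARSE's two-phase convention: a `_bufferSizeExt` call reports the workspace size, the caller allocates that buffer (or passes NULL so the library allocates it implicitly), and the routine itself then runs against the buffer. The sketch below illustrates that pattern with the in-place stable sort and identity-permutation helpers declared earlier in this file. It is a minimal, assumption-laden example: the `stable_sort_keys` wrapper name is invented for illustration, and these prototypes exist only in nvGraph's bundled `cusparse_internal.h`, not in the public cuSPARSE headers.

```cpp
#include <cassert>
#include <cuda_runtime.h>
#include <cusparse.h>
#include "cusparse_internal.h"   // internal prototypes shipped with nvGraph (assumed on the include path)

// Hypothetical helper: stable-sort n device-resident keys in place and return
// the applied permutation in d_P.
void stable_sort_keys(cusparseHandle_t handle, int n, int *d_key, int *d_P)
{
    // Phase 1: query the workspace size (the header notes n < 2^30 is assumed).
    size_t bufferSize = 0;
    cusparseStatus_t status =
        cusparseIstableSortByKey_bufferSizeExt(handle, n, &bufferSize);
    assert(status == CUSPARSE_STATUS_SUCCESS);

    // Phase 2: allocate the workspace explicitly; per the header comment,
    // passing NULL instead would make the library allocate it implicitly.
    void *d_buffer = nullptr;
    cudaError_t err = cudaMalloc(&d_buffer, bufferSize);
    assert(err == cudaSuccess);

    // Initialize the permutation to the identity: d_P[j] = j.
    status = cusparseIidentity(handle, n, d_P);
    assert(status == CUSPARSE_STATUS_SUCCESS);

    // Phase 3: in-place radix sort of d_key; d_P records where each key came from.
    status = cusparseIstableSortByKey(handle, n, d_key, d_P, d_buffer);
    assert(status == CUSPARSE_STATUS_SUCCESS);

    cudaFree(d_buffer);
}
```

The same query/allocate/run shape applies to the reduce and inclusive-scan helpers documented above; only the routine names and output arguments change.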
+cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02Batch_numericBoost( + cusparseHandle_t handle, + csrilu02BatchInfo_t info, + int enable_boost, + double *tol, + cuDoubleComplex *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02Batch_bufferSizeExt( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02Batch_bufferSizeExt( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02Batch_bufferSizeExt( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02Batch_bufferSizeExt( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + size_t *pBufferSizeInBytes); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02Batch_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02Batch_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02Batch_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02Batch_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02Batch( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descra, + float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02Batch( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descra, + double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02Batch( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descra, + cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + 
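The batched ILU0 declarations above (info object, numeric boost, `_bufferSizeExt`, `_analysis`, factorization, and zero-pivot query) appear to mirror the public `csrilu02` workflow. The sketch below strings the single-precision variants together in that order. It is an illustrative assumption rather than documented usage: in particular, the layout of `d_csrVal` as `batchSize` matrices sharing one sparsity pattern is inferred from the parameter lists, and the `batched_ilu0` wrapper name is hypothetical.

```cpp
#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>
#include <cusparse.h>
#include "cusparse_internal.h"   // internal prototypes shipped with nvGraph (assumed on the include path)

// Hypothetical wrapper: ILU0-factor batchSize CSR matrices in place.
// Assumption: the matrices share one sparsity pattern (d_csrRowPtr/d_csrColInd)
// and their values are stored back-to-back in d_csrVal.
void batched_ilu0(cusparseHandle_t handle, int m, int nnz,
                  const cusparseMatDescr_t descrA,
                  float *d_csrVal, const int *d_csrRowPtr, const int *d_csrColInd,
                  int batchSize)
{
    csrilu02BatchInfo_t info;
    cusparseStatus_t status = cusparseCreateCsrilu02BatchInfo(&info);
    assert(status == CUSPARSE_STATUS_SUCCESS);

    // Phase 1: query and allocate the workspace.
    size_t bufferSize = 0;
    status = cusparseScsrilu02Batch_bufferSizeExt(handle, m, nnz, descrA, d_csrVal,
                                                  d_csrRowPtr, d_csrColInd,
                                                  batchSize, info, &bufferSize);
    assert(status == CUSPARSE_STATUS_SUCCESS);
    void *d_buffer = nullptr;
    cudaError_t err = cudaMalloc(&d_buffer, bufferSize);
    assert(err == cudaSuccess);

    // Phase 2: symbolic analysis of the shared pattern, then numeric factorization.
    const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
    status = cusparseScsrilu02Batch_analysis(handle, m, nnz, descrA, d_csrVal,
                                             d_csrRowPtr, d_csrColInd,
                                             batchSize, info, policy, d_buffer);
    assert(status == CUSPARSE_STATUS_SUCCESS);
    status = cusparseScsrilu02Batch(handle, m, nnz, descrA, d_csrVal,
                                    d_csrRowPtr, d_csrColInd,
                                    batchSize, info, policy, d_buffer);
    assert(status == CUSPARSE_STATUS_SUCCESS);

    // Phase 3: report a zero pivot if the factorization encountered one.
    int position = 0;
    if (cusparseXcsrilu02Batch_zeroPivot(handle, info, &position) ==
        CUSPARSE_STATUS_ZERO_PIVOT)
    {
        printf("zero pivot at position %d\n", position);
    }

    cudaFree(d_buffer);
    cusparseDestroyCsrilu02BatchInfo(info);
}
```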
+cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02Batch( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descra, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrilu02BatchInfo_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +// --------------- csrsv2 batch -------------- + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2Batch_bufferSizeExt( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2Batch_bufferSizeExt( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2Batch_bufferSizeExt( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2Batch_bufferSizeExt( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2Batch_analysis( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2Batch_analysis( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2Batch_analysis( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2Batch_analysis( + cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + int batchSize, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2Batch_zeroPivot( + cusparseHandle_t handle, + csrsv2Info_t info, + int *position); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2Batch_solve( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int nnz, + const cusparseMatDescr_t descra, + const float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrsv2Info_t info, + const float *x, + float *y, + int batchSize, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2Batch_solve( + cusparseHandle_t 
handle, + cusparseOperation_t trans, + int m, + int nnz, + const cusparseMatDescr_t descra, + const double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrsv2Info_t info, + const double *x, + double *y, + int batchSize, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2Batch_solve( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int nnz, + const cusparseMatDescr_t descra, + const cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrsv2Info_t info, + const cuComplex *x, + cuComplex *y, + int batchSize, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2Batch_solve( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int nnz, + const cusparseMatDescr_t descra, + const cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrsv2Info_t info, + const cuDoubleComplex *x, + cuDoubleComplex *y, + int batchSize, + cusparseSolvePolicy_t policy, + void *pBuffer); + +//-------------- csrgemm2 ------------- + +cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2_spaceConfig( + csrgemm2Info_t info, + int disable_space_limit); + +// internal-use only +cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Rows_bufferSize( + cusparseHandle_t handle, + int m, + int n, + int k, + + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const int *csrRowPtrB, + const int *csrColIndB, + + csrgemm2Info_t info, + size_t *pBufferSize ); + +// internal-use only +cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Cols_bufferSize( + cusparseHandle_t handle, + int m, + int n, + int k, + + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const int *csrRowPtrB, + const int *csrColIndB, + + csrgemm2Info_t info, + size_t *pBufferSize ); + + + +cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Rows( + cusparseHandle_t handle, + int m, + int n, + int k, + + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + const cusparseMatDescr_t descrD, + int nnzD, + const int *csrRowPtrD, + const int *csrEndPtrD, + const int *csrColIndD, + + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + + int *nnzTotalDevHostPtr, + csrgemm2Info_t info, + void *pBuffer ); + + +cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Cols( + cusparseHandle_t handle, + int m, + int n, + int k, + + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + const cusparseMatDescr_t descrD, + int nnzD, + const int *csrRowPtrD, + const int *csrEndPtrD, + const int *csrColIndD, + + const cusparseMatDescr_t descrC, + const int *csrRowPtrC, + int *csrColIndC, + + csrgemm2Info_t info, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2Vals( + cusparseHandle_t handle, + int m, + int n, + int k, + + const float *alpha, + + const cusparseMatDescr_t descrA, + int nnzA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const float 
*csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + const cusparseMatDescr_t descrD, + int nnzD, + const float *csrValD, + const int *csrRowPtrD, + const int *csrEndPtrD, + const int *csrColIndD, + + const float *beta, + + const cusparseMatDescr_t descrC, + float *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrgemm2Info_t info, + void *pBuffer ); + + +cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2Vals( + cusparseHandle_t handle, + int m, + int n, + int k, + + const double *alpha, + + const cusparseMatDescr_t descrA, + int nnzA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const double *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + const cusparseMatDescr_t descrD, + int nnzD, + const double *csrValD, + const int *csrRowPtrD, + const int *csrEndPtrD, + const int *csrColIndD, + + const double *beta, + + const cusparseMatDescr_t descrC, + double *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrgemm2Info_t info, + void *pBuffer ); + + +cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2Vals( + cusparseHandle_t handle, + int m, + int n, + int k, + + const cuComplex *alpha, + + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + const cusparseMatDescr_t descrD, + int nnzD, + const cuComplex *csrValD, + const int *csrRowPtrD, + const int *csrEndPtrD, + const int *csrColIndD, + + const cuComplex *beta, + + const cusparseMatDescr_t descrC, + cuComplex *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrgemm2Info_t info, + void *pBuffer ); + + +cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2Vals( + cusparseHandle_t handle, + int m, + int n, + int k, + + const cuDoubleComplex *alpha, + + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + const cusparseMatDescr_t descrD, + int nnzD, + const cuDoubleComplex *csrValD, + const int *csrRowPtrD, + const int *csrEndPtrD, + const int *csrColIndD, + + const cuDoubleComplex *beta, + + const cusparseMatDescr_t descrC, + cuDoubleComplex *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrgemm2Info_t info, + void *pBuffer ); + + +// ---------------- csr2csc2 + +cusparseStatus_t CUSPARSEAPI cusparseXcsr2csc2_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnz, + const int *csrRowPtr, + const int *csrColInd, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseXcsr2csc2( + cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + const int *csrRowPtr, + const int *csrColInd, + int *cscColPtr, + int *cscRowInd, + int *cscValInd, + void *pBuffer); + +#if 0 +// ------------- CSC ILU0 + +cusparseStatus_t CUSPARSEAPI cusparseXcscilu02_getLevel( + cscilu02Info_t info, + int **level_ref); + +cusparseStatus_t CUSPARSEAPI cusparseXcscilu02_getCscColPtrL( + 
cscilu02Info_t info, + int **cscColPtrL_ref); + +cusparseStatus_t CUSPARSEAPI cusparseCreateCscilu02Info( + cscilu02Info_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCscilu02Info( + cscilu02Info_t info); + +cusparseStatus_t CUSPARSEAPI cusparseXcscilu02_zeroPivot( + cusparseHandle_t handle, + cscilu02Info_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseScscilu02_numericBoost( + cusparseHandle_t handle, + cscilu02Info_t info, + int enable_boost, + double *tol, + float *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseDcscilu02_numericBoost( + cusparseHandle_t handle, + cscilu02Info_t info, + int enable_boost, + double *tol, + double *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseCcscilu02_numericBoost( + cusparseHandle_t handle, + cscilu02Info_t info, + int enable_boost, + double *tol, + cuComplex *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseZcscilu02_numericBoost( + cusparseHandle_t handle, + cscilu02Info_t info, + int enable_boost, + double *tol, + cuDoubleComplex *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseScscilu02_bufferSize( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcscilu02_bufferSize( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcscilu02_bufferSize( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcscilu02_bufferSize( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + int *pBufferSizeInBytes); + + +cusparseStatus_t CUSPARSEAPI cusparseScscilu02_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcscilu02_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcscilu02_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcscilu02_analysis( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseScscilu02( + cusparseHandle_t handle, + int m, + 
int nnz, + const cusparseMatDescr_t descrA, + float *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcscilu02( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcscilu02( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcscilu02( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *cscVal, + const int *cscColPtr, + const int *cscEndPtr, + const int *cscRowInd, + cscilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); +#endif + +// ------------- csrxjusqua + +cusparseStatus_t CUSPARSEAPI cusparseXcsrxjusqua( + cusparseHandle_t handle, + int iax, + int iay, + int m, + int n, + const cusparseMatDescr_t descrA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + int *csrjusqua ); + +// ------------ csrxilu0 + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrxilu0Info( + csrxilu0Info_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrxilu0Info( + csrxilu0Info_t info); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrxilu0_zeroPivot( + cusparseHandle_t handle, + csrxilu0Info_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseScsrxilu0_numericBoost( + cusparseHandle_t handle, + csrxilu0Info_t info, + int enable_boost, + double *tol, + float *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrxilu0_numericBoost( + cusparseHandle_t handle, + csrxilu0Info_t info, + int enable_boost, + double *tol, + double *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrxilu0_numericBoost( + cusparseHandle_t handle, + csrxilu0Info_t info, + int enable_boost, + double *tol, + cuComplex *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrxilu0_numericBoost( + cusparseHandle_t handle, + csrxilu0Info_t info, + int enable_boost, + double *tol, + cuDoubleComplex *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrxilu0_bufferSizeExt( + cusparseHandle_t handle, + int iax, + int iay, + int m, + int n, + int k, + const cusparseMatDescr_t descrA, + const int *csrRowPtr, + const int *csrEndPtr, + const int *csrColInd, + csrxilu0Info_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsrxilu0( + cusparseHandle_t handle, + int iax, + int iay, + int m, + int n, + int k, + const cusparseMatDescr_t descrA, + float *csrVal, + const int *csrRowPtr, + const int *csrEndPtr, + const int *csrColInd, + csrxilu0Info_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrxilu0( + cusparseHandle_t handle, + int iax, + int iay, + int m, + int n, + int k, + const cusparseMatDescr_t descrA, + double *csrVal, + const int *csrRowPtr, + const int *csrEndPtr, + const int *csrColInd, + csrxilu0Info_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrxilu0( + cusparseHandle_t handle, + int iax, + int iay, + int m, + int n, + int k, + const cusparseMatDescr_t descrA, + cuComplex *csrVal, + const int *csrRowPtr, + const int *csrEndPtr, + 
const int *csrColInd, + csrxilu0Info_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrxilu0( + cusparseHandle_t handle, + int iax, + int iay, + int m, + int n, + int k, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrEndPtr, + const int *csrColInd, + csrxilu0Info_t info, + void *pBuffer); + +// ----------- csrxgemmSchur + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrxgemmSchurInfo( + csrxgemmSchurInfo_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrxgemmSchurInfo( + csrxgemmSchurInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrxgemmSchur_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int k, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + int nnzB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + int icx, + int icy, + const cusparseMatDescr_t descrC, + int nnzC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrxgemmSchurInfo_t info, + size_t *pBufferSizeInBytes); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrxgemmSchur( + cusparseHandle_t handle, + int m, + int n, + int k, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + int nnzA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + int nnzB, + const float *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + int icx, + int icy, + const cusparseMatDescr_t descrC, + int nnzC, + float *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrxgemmSchurInfo_t info, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseDcsrxgemmSchur( + cusparseHandle_t handle, + int m, + int n, + int k, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + int nnzA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + int nnzB, + const double *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + int icx, + int icy, + const cusparseMatDescr_t descrC, + int nnzC, + double *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrxgemmSchurInfo_t info, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseCcsrxgemmSchur( + cusparseHandle_t handle, + int m, + int n, + int k, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + int icx, + int icy, + const cusparseMatDescr_t descrC, + int nnzC, + cuComplex *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrxgemmSchurInfo_t info, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseZcsrxgemmSchur( + cusparseHandle_t handle, + int m, + int n, + int k, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const 
cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + int icx, + int icy, + const cusparseMatDescr_t descrC, + int nnzC, + cuDoubleComplex *csrValC, + const int *csrRowPtrC, + const int *csrEndPtrC, + const int *csrColIndC, + + csrxgemmSchurInfo_t info, + void *pBuffer); + +// ---------- csrxtrsm + +#if 0 +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrxtrsmInfo( + csrxtrsmInfo_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrxtrsmInfo( + csrxtrsmInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrxtrsm_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + + cusparseSideMode_t side, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + csrxtrsmInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsrxtrsm( + cusparseHandle_t handle, + + int m, + int n, + + cusparseSideMode_t side, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + float *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + csrxtrsmInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrxtrsm( + cusparseHandle_t handle, + + int m, + int n, + + cusparseSideMode_t side, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + double *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + csrxtrsmInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrxtrsm( + cusparseHandle_t handle, + + int m, + int n, + + cusparseSideMode_t side, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + const cuComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + cuComplex *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + csrxtrsmInfo_t info, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseZcsrxtrsm( + cusparseHandle_t handle, + + int m, + int n, + + cusparseSideMode_t side, + + int iax, + int iay, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrValA, + const int *csrRowPtrA, + const int *csrEndPtrA, + const int *csrColIndA, + + int ibx, + int iby, + const cusparseMatDescr_t descrB, + cuDoubleComplex *csrValB, + const int *csrRowPtrB, + const int *csrEndPtrB, + const int *csrColIndB, + + csrxtrsmInfo_t info, + void *pBuffer); +#endif + +// ------ CSR ilu03 +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu03Info( + csrilu03Info_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu03Info( + csrilu03Info_t info); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu03_bufferSizeExt( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const int *csrRowPtr, + const int *csrColInd, + csrilu03Info_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu03_zeroPivot( + cusparseHandle_t handle, + csrilu03Info_t info, + int 
*position); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu03_numericBoost( + cusparseHandle_t handle, + csrilu03Info_t info, + int enable_boost, + double *tol, + float *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu03_numericBoost( + cusparseHandle_t handle, + csrilu03Info_t info, + int enable_boost, + double *tol, + double *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu03_numericBoost( + cusparseHandle_t handle, + csrilu03Info_t info, + int enable_boost, + double *tol, + cuComplex *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu03_numericBoost( + cusparseHandle_t handle, + csrilu03Info_t info, + int enable_boost, + double *tol, + cuDoubleComplex *numeric_boost); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu03( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrilu03Info_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu03( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrilu03Info_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu03( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrilu03Info_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu03( + cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + const int *csrColInd, + csrilu03Info_t info, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseXcsrValid( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const int *csrRowPtrA, + const int *csrColIndA, + int *valid); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrmm3( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *B, + int ldb, + const float *beta, + float *C, + int ldc, + void *buffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmm3( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *B, + int ldb, + const double *beta, + double *C, + int ldc, + void *buffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmm3( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const cuComplex *B, + int ldb, + const cuComplex *beta, + cuComplex *C, + int ldc, + void *buffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmm3( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, + int ldc, + void 
*buffer); + +cusparseStatus_t CUSPARSEAPI cusparseStranspose( + cusparseHandle_t handle, + cusparseOperation_t transa, + int m, + int n, + const float *alpha, + const float *A, + int lda, + float *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseDtranspose( + cusparseHandle_t handle, + cusparseOperation_t transa, + int m, + int n, + const double *alpha, + const double *A, + int lda, + double *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseCtranspose( + cusparseHandle_t handle, + cusparseOperation_t transa, + int m, + int n, + const cuComplex *alpha, + const cuComplex *A, + int lda, + cuComplex *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseZtranspose( + cusparseHandle_t handle, + cusparseOperation_t transa, + int m, + int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *C, + int ldc); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrmv_binary( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float *alpha, + const cusparseMatDescr_t descra, + const int *csrRowPtr, + const int *csrColInd, + const float *x, + const float *beta, + float *y); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmv_binary( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double *alpha, + const cusparseMatDescr_t descra, + const int *csrRowPtr, + const int *csrColInd, + const double *x, + const double *beta, + double *y); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_binary( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descra, + const int *csrRowPtr, + const int *csrColInd, + const cuComplex *x, + const cuComplex *beta, + cuComplex *y); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_binary( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descra, + const int *csrRowPtr, + const int *csrColInd, + const cuDoubleComplex *x, + const cuDoubleComplex *beta, + cuDoubleComplex *y); + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrmmInfo( + csrmmInfo_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrmmInfo( + csrmmInfo_t info); + +cusparseStatus_t CUSPARSEAPI csrmm4_analysis( + cusparseHandle_t handle, + int m, // number of rows of A + int k, // number of columns of A + int nnzA, // number of nonzeros of A + const cusparseMatDescr_t descrA, + const int *csrRowPtrA, // m+1 + const int *csrColIndA, // nnzA + csrmmInfo_t info, + double *ratio // nnzB / nnzA + ); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrmm4( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *B, + int ldb, + const float *beta, + float *C, + int ldc, + csrmmInfo_t info, + void *buffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmm4( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *B, + int ldb, + const double *beta, + double *C, + int ldc, + csrmmInfo_t info, + void *buffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmm4( + cusparseHandle_t handle, + cusparseOperation_t transa, + 
cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const cuComplex *B, + int ldb, + const cuComplex *beta, + cuComplex *C, + int ldc, + csrmmInfo_t info, + void *buffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmm4( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, + int ldc, + csrmmInfo_t info, + void *buffer); + +cusparseStatus_t CUSPARSEAPI cusparseScsrmm5( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnzA, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *B, + int ldb, + const float *beta, + float *C, + int ldc + ); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmm5( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnzA, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *B, + int ldb, + const double *beta, + double *C, + int ldc + ); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrmm6( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnzA, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *B, + int ldb, + const float *beta, + float *C, + int ldc + ); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmm6( + cusparseHandle_t handle, + cusparseOperation_t transa, + cusparseOperation_t transb, + int m, + int n, + int k, + int nnzA, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *B, + int ldb, + const double *beta, + double *C, + int ldc + ); + + + +cusparseStatus_t CUSPARSEAPI cusparseSmax( + cusparseHandle_t handle, + int n, + const float *x, + float *valueHost, + float *work /* at least n+1 */ + ); + +cusparseStatus_t CUSPARSEAPI cusparseDmax( + cusparseHandle_t handle, + int n, + const double *x, + double *valueHost, + double *work /* at least n+1 */ + ); + +cusparseStatus_t CUSPARSEAPI cusparseSmin( + cusparseHandle_t handle, + int n, + const float *x, + float *valueHost, + float *work /* at least n+1 */ + ); + +cusparseStatus_t CUSPARSEAPI cusparseDmin( + cusparseHandle_t handle, + int n, + const double *x, + double *valueHost, + double *work /* at least n+1 */ + ); + +cusparseStatus_t CUSPARSEAPI cusparseI16sort_internal_bufferSizeExt( + cusparseHandle_t handle, + int n, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseI16sort_internal( + cusparseHandle_t handle, + int num_bits, /* <= 16 */ + int n, + unsigned short *key, + int *P, + int ascend, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseI32sort_internal_bufferSizeExt( + cusparseHandle_t handle, + int n, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseI32sort_internal( + cusparseHandle_t handle, + int num_bits, /* <= 32 */ + int n, + 
unsigned int *key, + int *P, + int ascend, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseI64sort_internal_bufferSizeExt( + cusparseHandle_t handle, + int n, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseI64sort_internal( + cusparseHandle_t handle, + int num_bits, /* <= 64 */ + int n, + unsigned long long *key, + int *P, + int ascend, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseIsort_bufferSizeExt( + cusparseHandle_t handle, + int n, + const int *key, + const int *P, + int ascend, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseIsort( + cusparseHandle_t handle, + int n, + int *key, + int *P, + int ascend, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseSsort_bufferSizeExt( + cusparseHandle_t handle, + int n, + const float *key, + const int *P, + int ascend, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseSsort( + cusparseHandle_t handle, + int n, + float *key, + int *P, + int ascend, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseDsort_bufferSizeExt( + cusparseHandle_t handle, + int n, + const double *key, + const int *P, + int ascend, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDsort( + cusparseHandle_t handle, + int n, + double *key, + int *P, + int ascend, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseHsort_bufferSizeExt( + cusparseHandle_t handle, + int n, + const __half *key, + const int *P, + int ascend, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseHsort( + cusparseHandle_t handle, + int n, + __half *key_fp16, + int *P, + int ascend, + void *pBuffer); + + + + + +cusparseStatus_t CUSPARSEAPI cusparseHsortsign_bufferSizeExt( + cusparseHandle_t handle, + int n, + const __half *key, + const int *P, + int ascend, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseSsortsign_bufferSizeExt( + cusparseHandle_t handle, + int n, + const float *key, + const int *P, + int ascend, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDsortsign_bufferSizeExt( + cusparseHandle_t handle, + int n, + const double *key, + const int *P, + int ascend, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseIsortsign_bufferSizeExt( + cusparseHandle_t handle, + int n, + const int *key, + const int *P, + int ascend, + size_t *pBufferSize); + +//#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHsortsign( + cusparseHandle_t handle, + int n, + __half *key, + int *P, + int ascend, + int *h_nnz_bucket0, /* host */ + void *pBuffer); +//#endif + +cusparseStatus_t CUSPARSEAPI cusparseSsortsign( + cusparseHandle_t handle, + int n, + float *key, + int *P, + int ascend, + int *h_nnz_bucket0, /* host */ + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDsortsign( + cusparseHandle_t handle, + int n, + double *key, + int *P, + int ascend, + int *h_nnz_bucket0, /* host */ + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseIsortsign( + cusparseHandle_t handle, + int n, + int *key, + int *P, + int ascend, + int *h_nnz_bucket0, /* host */ + void *pBuffer); + +//---------------------------------------------- + + +cusparseStatus_t CUSPARSEAPI cusparseDDcsrMv_hyb( + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double *alpha, + const cusparseMatDescr_t descra, + const double *csrVal, + const int *csrRowPtr, + const int *csrColInd, + const double *x, + const double *beta, + double *y); + + +/* + * gtsv2Batch: cuThomas algorithm + * gtsv3Batch: QR + * gtsv4Batch: LU with partial 
pivoting + */ +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2Batch( + cusparseHandle_t handle, + int n, + float *dl, + float *d, + float *du, + float *x, + int batchCount); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2Batch( + cusparseHandle_t handle, + int n, + double *dl, + double *d, + double *du, + double *x, + int batchCount); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2Batch( + cusparseHandle_t handle, + int n, + cuComplex *dl, + cuComplex *d, + cuComplex *du, + cuComplex *x, + int batchCount); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2Batch( + cusparseHandle_t handle, + int n, + cuDoubleComplex *dl, + cuDoubleComplex *d, + cuDoubleComplex *du, + cuDoubleComplex *x, + int batchCount); + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv3Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const float *dl, + const float *d, + const float *du, + const float *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv3Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const double *dl, + const double *d, + const double *du, + const double *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv3Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + const cuComplex *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv3Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + const cuDoubleComplex *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv3Batch( + cusparseHandle_t handle, + int n, + float *dl, + float *d, + float *du, + float *x, + int batchSize, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv3Batch( + cusparseHandle_t handle, + int n, + double *dl, + double *d, + double *du, + double *x, + int batchSize, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv3Batch( + cusparseHandle_t handle, + int n, + cuComplex *dl, + cuComplex *d, + cuComplex *du, + cuComplex *x, + int batchSize, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv3Batch( + cusparseHandle_t handle, + int n, + cuDoubleComplex *dl, + cuDoubleComplex *d, + cuDoubleComplex *du, + cuDoubleComplex *x, + int batchSize, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv4Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const float *dl, + const float *d, + const float *du, + const float *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv4Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const double *dl, + const double *d, + const double *du, + const double *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv4Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + const cuComplex *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv4Batch_bufferSizeExt( + cusparseHandle_t handle, + int n, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + const cuDoubleComplex *x, + int batchSize, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv4Batch( + cusparseHandle_t handle, + int n, + float *dl, + float *d, + float *du, + float *x, + int batchSize, + void *pBuffer); 
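/*
 * Illustrative sketch, not part of the original patch: the batched
 * tridiagonal solvers in this section follow the usual two-step cuSPARSE
 * pattern -- query the workspace size with the *_bufferSizeExt routine,
 * allocate the buffer, then solve. The snippet only rearranges the
 * signatures declared here; `handle`, `n`, the batched diagonals dl/d/du,
 * the right-hand sides `x` and `batchSize` are assumed to already be set up
 * on the device, and because these are internal entry points their exact
 * semantics may differ from the public gtsv API.
 *
 *   size_t bufferSize = 0;
 *   void  *pBuffer    = NULL;
 *   cusparseSgtsv3Batch_bufferSizeExt(handle, n, dl, d, du, x, batchSize,
 *                                     &bufferSize);        // QR variant
 *   cudaMalloc(&pBuffer, bufferSize);
 *   cusparseSgtsv3Batch(handle, n, dl, d, du, x, batchSize, pBuffer);
 *   cudaFree(pBuffer);
 */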
+ +cusparseStatus_t CUSPARSEAPI cusparseDgtsv4Batch( + cusparseHandle_t handle, + int n, + double *dl, + double *d, + double *du, + double *x, + int batchSize, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv4Batch( + cusparseHandle_t handle, + int n, + cuComplex *dl, + cuComplex *d, + cuComplex *du, + cuComplex *x, + int batchSize, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv4Batch( + cusparseHandle_t handle, + int n, + cuDoubleComplex *dl, + cuDoubleComplex *d, + cuDoubleComplex *du, + cuDoubleComplex *x, + int batchSize, + void *pBuffer); + + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + + +#endif /* CUSPARSE_INTERNAL_H_ */ + diff --git a/cpp/nvgraph/test/Makefile b/cpp/nvgraph/test/Makefile new file mode 100644 index 00000000000..f2617d7fc85 --- /dev/null +++ b/cpp/nvgraph/test/Makefile @@ -0,0 +1,33 @@ +# Makefile for building NVCompute/CUDA BLAS library +SOLNDIR := ../. + +# Get the profile settings +ifdef VULCAN +include $(VULCAN_TOOLKIT_BASE)/build/getprofile.mk +include $(VULCAN_TOOLKIT_BASE)/build/config/$(PROFILE).mk +include $(VULCAN_TOOLKIT_BASE)/build/config/DetectOS.mk +else +include ../../build/getprofile.mk +include ../../build/config/$(PROFILE).mk +include ../../build/config/DetectOS.mk +endif + +export I_AM_SLOPPY = 1 +AGNOSTIC_PROJECTS += nvgraph_test +AGNOSTIC_PROJECTS += nvgraph_capi_tests +AGNOSTIC_PROJECTS += nvgraph_capi_tests_subgraph +AGNOSTIC_PROJECTS += nvgraph_capi_tests_conversion +AGNOSTIC_PROJECTS += nvgraph_benchmark +AGNOSTIC_PROJECTS += nvgraph_capi_tests_clustering +AGNOSTIC_PROJECTS += nvgraph_capi_tests_contraction +AGNOSTIC_PROJECTS += nvgraph_capi_tests_traversal +AGNOSTIC_PROJECTS += nvgraph_capi_tests_triangles +AGNOSTIC_PROJECTS += nvgraph_2d_partitioning_test +AGNOSTIC_PROJECTS += nvgraph_capi_tests_2d_bfs +AGNOSTIC_PROJECTS += nvgraph_capi_tests_2d_bfs_net + +ifdef VULCAN +include $(VULCAN_TOOLKIT_BASE)/build/common.mk +else +include ../../build/common.mk +endif diff --git a/cpp/nvgraph/test/data_gen.sh b/cpp/nvgraph/test/data_gen.sh new file mode 100755 index 00000000000..a911b688f69 --- /dev/null +++ b/cpp/nvgraph/test/data_gen.sh @@ -0,0 +1,58 @@ +#!/bin/sh +#Usage sh data_gen size1 size2 ... +#Generate power law in-degree plus rmat graphs of size size1 ... sizeN +#Corresponding transposed and binary csr are generated as well + +convert (){ +edges=$1 +#echo "Starting Sort on $edges..." +./generators/convertors/sort $edges +#echo "Done" + +tmp="_s" +sedges=$edges$tmp +echo "Starting H on $sedges ..." +./generators/convertors/H $sedges +#echo "Done" + +tmp="_mtx" +matrix=$sedges$tmp +#delete soted edges +rm $sedges + +echo "Starting HTa on $matrix ..." +./generators/convertors/HTA $matrix + +tmp="_T" +outp=$edges$tmp +outpp=$matrix$tmp +mv $outpp $outp +#delete H +rm $matrix + +#echo "Starting binary conversion ..." +./generators/convertors/mtob $outp +#echo "Generated transposed coo and transposed csr bin" +} + +echo "Building the tools ..." +make -C generators +make -C generators/convertors +#generate the graphs we need here +#loop over script arguments which represent graph sizes. 
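# Illustrative example, not part of the original script (assumes the
# generators above build cleanly and that the local_test_data/ directory used
# below exists): running
#   sh data_gen.sh 1024 4096
# produces, for each requested size N, local_test_data/plod_graph_N.mtx and
# local_test_data/rmat_graph_N.mtx, plus the transposed (*_T) and binary
# (*_T_bin) files created by the convert() function defined above.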
+for var in "$@" +do +echo "Generate graphs of size $var" +vertices=$var +option="i" +./generators/plodg $vertices $option +./generators/rmatg $vertices $option +graph="plod_graph_" +format=".mtx" +path_to_data="local_test_data/" +name="$path_to_data$graph$vertices$format" +convert $name +graph="rmat_graph_" +name="$path_to_data$graph$vertices$format" +convert $name +done diff --git a/cpp/nvgraph/test/generators/Makefile b/cpp/nvgraph/test/generators/Makefile new file mode 100644 index 00000000000..3d9c9ddc808 --- /dev/null +++ b/cpp/nvgraph/test/generators/Makefile @@ -0,0 +1,24 @@ +# Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +CXX=g++ +CXXFLAGS=-Wall -Ofast -march=native -pipe + +all: print_info plodg rmatg + +plodg: plod.cpp + $(CXX) $(CXXFLAGS) $< -o $@ + +rmatg: rmat.cpp + $(CXX) $(CXXFLAGS) $< -o $@ + +clean: + rm -f rmatg plodg + +print_info: + $(info The Boost Graph Library is required) diff --git a/cpp/nvgraph/test/generators/convertors/H_to_HtSorted_and_a.cpp b/cpp/nvgraph/test/generators/convertors/H_to_HtSorted_and_a.cpp new file mode 100644 index 00000000000..96f7b36e5a0 --- /dev/null +++ b/cpp/nvgraph/test/generators/convertors/H_to_HtSorted_and_a.cpp @@ -0,0 +1,135 @@ +#include +#include +#include +#include // std::sort +#include // std::vector +// This code transpose a matrix H and compute the flag vector of empty rows a. +// We assume that H is row-substochastic, in MatrixMarket format and data are sorted by row id +// The output is filename_T.filetype, H is printed first then a is printed. 
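// Illustrative worked example, not part of the original source: for a 3x3
// row-substochastic H whose second row has no outgoing edges,
//
//        [ 0   0.5  0.5 ]                [ 0 ]
//    H = [ 0    0    0  ]   gives   a =  [ 1 ]   <- empty (dangling) row
//        [ 1    0    0  ]                [ 0 ]
//
// so the output file holds H^T in MatrixMarket coordinate format followed by
// the 0/1 bookmark vector a, one entry per row of H, which is the layout the
// AMGX PageRank input mentioned in the usage message expects.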
+ +struct elt { + long int r; + long int c; + double v; +}; + +void printUsageAndExit() +{ + printf("%s", "Fatal Error\n"); + printf("%s", "Usage: ./HTA H.mtx\n"); + printf("%s", "NOTE1: H is the row-substochastic matrix of a graph\n"); + printf("%s", "NOTE2: H is in MatrixMarket coordinate real general format\n"); + printf("%s", "NOTE3: Data are sorted by row id\n"); + printf("%s", "Output : H^t and the bookmark vector of empty rows\n"); + printf("%s", "***This output fits the input of AMGX PageRank***\n"); + exit(0); +} + +inline bool operator< (const elt& a, const elt& b) +{ // ordered by row and then by colum inside a row + return a.r A; + std::vector a; + // Get I/O names + // The output is filename_T + while (argv[1][i] != '\0') + {outp[i] = argv[1][i];i++;} + outp[i] = '_'; i++; + outp[i] = 'T';i++; + outp[i]='\0'; + // Open files + fpin = fopen(argv[1],"r"); + fpout = fopen(outp,"w"); + if (!fpin || !fpout) + { + printf("%s", "Fatal Error : I/O fail\n"); + exit(0); + } + + // Skip lines starting with "%%"" + do + { + cc = fgetc(fpin); + if (cc == '%') fgets(outp,128,fpin); + } + while (cc == '%'); + fseek( fpin, -1, SEEK_CUR ); + + // Get n and nz + fscanf(fpin,"%ld",&n); + fscanf(fpin,"%ld",&n); + fscanf(fpin,"%ld",&nz); + + // Print format and size + fprintf(fpout, "%s", "%%"); + fprintf(fpout,"MatrixMarket matrix coordinate real general\n"); + fprintf(fpout, "%s", "%%"); + fprintf(fpout,"AMGX rhs\n"); + fprintf(fpout,"%ld %ld %ld\n",n, n, nz); + + // Empty rows at the begining + fscanf(fpin,"%ld",&e.c); + fscanf(fpin,"%ld",&e.r); + fscanf(fpin,"%lf",&e.v); + A.push_back(e); + + for (j=0; j(e.c)-1; j++) + { + std::cout< lastr) + { + if (e.c > lastr+1) + { + a.push_back(0); + //Successive empty rows + for (k=0; k(e.c)-lastr-1; k++) + a.push_back(1); + } + else + a.push_back(0); + } + } + a.push_back(0); + + // Empty rows at the end + for (k=a.size(); k::iterator it = A.begin() ; it != A.end(); ++it) + fprintf(fpout,"%ld %ld %.9f\n",it->r, it->c, it->v); + + for (std::vector::iterator it = a.begin() ; it != a.end(); ++it) + fprintf(fpout,"%u\n",*it); + + return 0; + +} + diff --git a/cpp/nvgraph/test/generators/convertors/Makefile b/cpp/nvgraph/test/generators/convertors/Makefile new file mode 100644 index 00000000000..e4f9921479d --- /dev/null +++ b/cpp/nvgraph/test/generators/convertors/Makefile @@ -0,0 +1,21 @@ +CC=g++ +CFLAGS=-O3 -march=native -pipe -w +LDFLAGS=-lm + +all: sort HTA H mtob + +sort: sort_eges.cpp + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ + +HTA: H_to_HtSorted_and_a.cpp + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ + +H: edges_to_H.cpp + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ + +mtob: binary_converter.cpp + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ + +clean: + rm sort HTA mtob + diff --git a/cpp/nvgraph/test/generators/convertors/README.txt b/cpp/nvgraph/test/generators/convertors/README.txt new file mode 100644 index 00000000000..6625767071a --- /dev/null +++ b/cpp/nvgraph/test/generators/convertors/README.txt @@ -0,0 +1,62 @@ +----------------------- +Compile +----------------------- +> make + +----------------------- +Run +----------------------- + + +To preprocess a set of edges in matrix market patern format +> ./pprocess.sh edges.dat + + + +You can run separately +Sort : +> ./sort edges.dat + +Compute H : +> ./H edges.dat + +Compute H transposed and dangling node vector +> ./HTA H.mtx + +Convert in AmgX binary format +> ./mtob HTA.mtx + +----------------------- +Input +----------------------- +The format for sort and H is matrix market patern format +example : + +%%comment +% as 
much comments as you want +%... +size size nonzero +a b +c d +a e +e a +. +. +. +[a-e] are in N* + + +The format for HTA and mtob is matrix market coordinate format +%%comment +% as much comments as you want +%... +size size nonzero +a b f +c d g +a e h +e a i +. +. +. +[a-e] are in N* +[f-i] are in R \ No newline at end of file diff --git a/cpp/nvgraph/test/generators/convertors/binary_converter.cpp b/cpp/nvgraph/test/generators/convertors/binary_converter.cpp new file mode 100644 index 00000000000..e4ecd7ba8de --- /dev/null +++ b/cpp/nvgraph/test/generators/convertors/binary_converter.cpp @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include // std::sort etc. +#include // std::vector +#include // std::cout + +typedef int idx_t; +typedef double val_t; + +void printUsageAndExit() +{ + printf("%s", "Usage:./mtob M.mtx\n"); + printf("%s", "NOTE1: M is square, in MatrixMarket coordinate real general format\n"); + printf("%s", "NOTE2: Data are sorted by row id\n"); + + exit(0); +} + +void print_csr( std::vector &row_ptrs, std::vector &col_indices, std::vector &val) +{ + for (std::vector::iterator it = row_ptrs.begin(); it != row_ptrs.end(); ++it) + std::cout << ' ' << *it; + std::cout << '\n'; + for (std::vector::iterator it = col_indices.begin(); it != col_indices.end(); ++it) + std::cout << ' ' << *it; + std::cout << '\n'; + for (std::vector::iterator it = val.begin(); it != val.end(); ++it) + std::cout << ' ' << *it; + std::cout << '\n'; +} + +// Generates csr from matrix market format +void read_csr( FILE *fpin, + idx_t n, + idx_t nz, + std::vector &row_weight, + std::vector &row_ptrs, + std::vector &col_indices, + std::vector &val) +{ + idx_t weight=0, i=0 ,j=0, k=0, lastr=0, r=0, c=0; + double v; + // Empty rows at the begining + fscanf(fpin,"%d",&r); + fscanf(fpin,"%d",&c); + fscanf(fpin,"%lf",&v); + row_ptrs.push_back(0); + col_indices.push_back(c-1); + val.push_back(v); + weight++; + + for (j=0; j &a) +{ + val_t v; + for (idx_t i=0; i< n;i++) + { + fscanf(fpin,"%lf",&v); + a.push_back(v); + } +} +void write_csr_bin (char *argv[], + idx_t n, + idx_t nz, + std::vector &row_weight, + std::vector &row_ptrs, + std::vector &col_indices, + std::vector &val, + std::vector &a + ) +{ + idx_t i; + char outp [128]; + // Generate output name + while (argv[1][i] != '\0') + { + outp[i] = argv[1][i]; + i++; + } + outp[i] = '_';i++; + outp[i] = 'b';i++; + outp[i] = 'i';i++; + outp[i] = 'n';i++; + outp[i]='\0'; + FILE *fpout = NULL; + + fpout = fopen(outp,"w"); + if (!fpout) + { + printf("%s", "Fatal Error : I/O fail\n"); + exit(0); + } + const char header [] = "%%NVAMGBinary\n"; + const int system_header_size = 9; + uint32_t system_flags [] = { 1, 1, 0, 0, 0, 1, 1, n, nz }; + fwrite(header, sizeof(char), strlen(header), fpout); + fwrite(system_flags, sizeof(uint32_t), system_header_size, fpout); + fwrite(&row_ptrs[0], sizeof(idx_t), row_ptrs.size(), fpout); + fwrite(&col_indices[0], sizeof(idx_t), col_indices.size(), fpout); + fwrite(&val[0], sizeof(val_t), val.size(), fpout); + fwrite(&a[0], sizeof(val_t), a.size(), fpout); + fclose(fpout); +} +int main (int argc, char **argv) +{ + // Vars + idx_t i = 0; + idx_t n=0, m=0, nz=0, nparts=0, sym=0; + char dum[128], cc; + FILE *fpin = NULL; + std::vector row_ptrs, col_indices, row_weight; + std::vector a , val; + + // Check args + + if (argc != 2) printUsageAndExit(); + + // Open file + fpin = fopen(argv[1],"r"); + if (!fpin) + { + printf("%s", "Fatal Error : I/O fail\n"); + exit(0); + } + + // Skip lines starting with "%%"" + do + { + cc = 
fgetc(fpin); + if (cc == '%') fgets(dum,128,fpin); + } + while (cc == '%'); + fseek( fpin, -1, SEEK_CUR ); + + // Get n and nz + fscanf(fpin,"%ld",&n); + fscanf(fpin,"%ld",&m); + fscanf(fpin,"%ld",&nz); + if (n != m) + { + printf("%s", "Fatal Error : The matrix is not square\n"); + exit(0); + } + + //printf("Reading...\n"); + read_csr(fpin, n, nz, row_weight, row_ptrs, col_indices, val); + read_vector_mtx(fpin, n, a); + + //printf("Writing...\n"); + write_csr_bin(argv, n, nz, row_weight, row_ptrs, col_indices, val,a); + + //printf("Success!\n"); + return 0; +} + diff --git a/cpp/nvgraph/test/generators/convertors/edges_to_H.cpp b/cpp/nvgraph/test/generators/convertors/edges_to_H.cpp new file mode 100644 index 00000000000..f77baea628a --- /dev/null +++ b/cpp/nvgraph/test/generators/convertors/edges_to_H.cpp @@ -0,0 +1,100 @@ +#include +#include +#include +#include + +struct edge { + unsigned long int r; + unsigned long int c; +}; + +void printUsageAndExit() +{ + printf("%s", "Fatal Error\n"); + printf("%s", "Usage: ./H edges.dat\n"); + printf("%s", "Input : Graph given as a sorted set of edges\n"); + printf("%s", "Output : Row sub-stochastic matrix in MatrixMarket format\n"); + exit(0); +} + +int main (int argc, char *argv[]) +{ + // Check args + if (argc != 2) printUsageAndExit(); + + // Vars + unsigned long int n, nz, i = 0, current_r, nbr = 1; + int ok; + double scal; + char outp[128], cc; + FILE *fpin = NULL, *fpout = NULL; + edge e; + std::vector row; + // Get I/O names + // The output is filename.mtx + while (argv[1][i] != '\0') + {outp[i] = argv[1][i];i++;} + outp[i] = '_'; i++; + outp[i] = 'm';i++;outp[i] = 't';i++;outp[i] = 'x';i++; + outp[i]='\0'; + + // Open files + fpin = fopen(argv[1],"r"); + fpout = fopen(outp,"w"); + if (!fpin || !fpout) + { + printf("%s", "Fatal Error : I/O fail\n"); + exit(0); + } + + // Get n and nz + fscanf(fpin,"%lu",&n); + fscanf(fpin,"%lu",&n); + fscanf(fpin,"%lu",&nz); + + fprintf(fpout, "%s", "%%" ); + fprintf(fpout,"MatrixMarket matrix coordinate real general\n"); + fprintf(fpout,"%lu %lu %lu\n",n, n, nz); + + // Read the first edge + ok = fscanf(fpin,"%lu",&e.r); + if (ok) + { + fscanf(fpin,"%lu",&e.c); + current_r = e.r; + row.push_back(e); + } + else + { + printf("%s", "Fatal Error : Wrong data format\n"); + exit(0); + } + + //Loop + for (i=0; i::iterator it = row.begin() ; it != row.end(); ++it) + fprintf(fpout,"%lu %lu %.9lf\n",it->r, it->c, scal); + row.clear(); + nbr = 1; + } + row.push_back(e); + } + // Last print + scal = 1.0/nbr; + for (std::vector::iterator it = row.begin() ; it != row.end(); ++it) + fprintf(fpout,"%lu %lu %.9f\n",it->r, it->c, scal); + + return 0; +} + diff --git a/cpp/nvgraph/test/generators/convertors/pprocess.sh b/cpp/nvgraph/test/generators/convertors/pprocess.sh new file mode 100755 index 00000000000..37d763ce35e --- /dev/null +++ b/cpp/nvgraph/test/generators/convertors/pprocess.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +edges="$1" +echo "Starting Sort on $edges..." +./sort $edges +echo "Done" + +tmp="_s" +sedges=$edges$tmp +echo "Starting H on $sedges ..." +./H $sedges +echo "Done" + +tmp="_mtx" +matrix=$sedges$tmp +#delete soted edges +rm $sedges + +echo "Starting HTa on $matrix ..." +./HTA $matrix + +tmp="_T" +outp=$edges$tmp +outpp=$matrix$tmp +mv $outpp $outp +#delete H +rm $matrix + +echo "Starting binary conversion ..." 
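# Editorial note on naming, not part of the original script: every converter
# appends a suffix to its input, so for an input edges.dat the chain above
# yields
#   edges.dat_s       (sorted edges, deleted after use)
#   edges.dat_s_mtx   (row-substochastic H, deleted after use)
#   edges.dat_T       (H transposed plus the dangling-node bookmark vector)
# and the call below writes the final edges.dat_T_bin file using the
# %%NVAMGBinary layout emitted by binary_converter.cpp.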
+./mtob $outp +echo "Done" + diff --git a/cpp/nvgraph/test/generators/convertors/sort_eges.cpp b/cpp/nvgraph/test/generators/convertors/sort_eges.cpp new file mode 100644 index 00000000000..907e6fdebfb --- /dev/null +++ b/cpp/nvgraph/test/generators/convertors/sort_eges.cpp @@ -0,0 +1,92 @@ +#include +#include +#include // std::sort +#include // std::vector + +struct edge { + unsigned long int r; + unsigned long int c; +}; + +void printUsageAndExit() +{ + printf("%s", "Fatal Error\n"); + printf("%s", "Usage: ./sort edges.dat\n"); + printf("%s", "Input : Graph in matrix market parttern format"); + printf("%s", "Output : Graph with sorted edges in matrix market parttern format\n"); + exit(0); +} + +inline bool operator< (const edge& a, const edge& b){ if(a.r edges; + + // Get I/O names + // The output is filename.mtx + while (argv[1][i] != '\0') + {outp[i] = argv[1][i];i++;} + outp[i] = '_'; i++; + outp[i] = 's';i++; + outp[i]='\0'; + + // Open files + fpin = fopen(argv[1],"r"); + fpout = fopen(outp,"w"); + if (!fpin || !fpout) + { + printf("%s", "Fatal Error : I/O fail\n"); + exit(0); + } + + // Skip lines starting with "%"" + do + { + cc = fgetc(fpin); + if (cc == '%') fgets(outp,128,fpin); + } + while (cc == '%'); + fseek( fpin, -1, SEEK_CUR ); + + // Get n and nz + fscanf(fpin,"%lu",&n); + //fscanf(fpin,"%lu",&n); + fscanf(fpin,"%lu",&nz); + fprintf(fpout,"%lu %lu %lu\n",n, n, nz); + // Read the first edge + ok = fscanf(fpin,"%lu",&e.r); + if (ok) + { + fscanf(fpin,"%lu",&e.c); + edges.push_back(e); + } + else + { + printf("%s", "Fatal Error : Wrong data format\n"); + exit(0); + } + + //Loop + for (i=0; i::iterator it = edges.begin() ; it != edges.end(); ++it) + fprintf(fpout,"%lu %lu\n",it->r, it->c); + return 0; +} + diff --git a/cpp/nvgraph/test/generators/plod.cpp b/cpp/nvgraph/test/generators/plod.cpp new file mode 100644 index 00000000000..dab6528cc3c --- /dev/null +++ b/cpp/nvgraph/test/generators/plod.cpp @@ -0,0 +1,89 @@ +#include +#include +#include + +#include +#include +#include +#include + + +void printUsageAndExit() +{ + printf("%s", "Usage:./plodg x\n"); + printf("%s", "x is the size of the graph\n"); + exit(0); +} + +int main(int argc, char *argv[]) +{ + + /* " The Power Law Out Degree (PLOD) algorithm generates a scale-free graph from three parameters, n, alpha, and beta. + [...] The value of beta controls the y-intercept of the curve, so that increasing beta increases the average degree of vertices (credit = beta*x^-alpha). + [...] The value of alpha controls how steeply the curve drops off, with larger values indicating a steeper curve. */ + // From Boost documentation http://www.boost.org/doc/libs/1_47_0/libs/graph/doc/plod_generator.html + + // we use setS aka std::set for edges storage + // so we have at most one edges between 2 vertices + // the extra cost is O(log(E/V)). + typedef boost::adjacency_list Graph; + typedef boost::plod_iterator SFGen; + + if (argc < 2) printUsageAndExit(); + int size = atoi (argv[1]); + assert (size > 1 && size < INT_MAX); + double alpha = 2.57; // It is known that web graphs have alpha ~ 2.72. + double beta = size*512+1024; // This will give an average degree ~ 15 + + // generation + std::cout << "generating ... 
"<<'\n'; + boost::minstd_rand gen; + Graph g(SFGen(gen, size, alpha, beta, false), SFGen(), size); + boost::graph_traits::edge_iterator edge, edge_end; + + std::cout << "vertices : " << num_vertices(g) <<'\n'; + std::cout << "edges : " << num_edges(g) <<'\n'; + std::cout << "average degree : "<< static_cast(num_edges(g))/num_vertices(g)<< '\n'; + // Print in matrix coordinate real general format + std::cout << "writing ... "<<'\n'; + std::stringstream tmp; + tmp <<"local_test_data/plod_graph_" << size << ".mtx"; + const std::string filename = tmp.str(); + std::ofstream fout(tmp.str().c_str()) ; + + if (argv[2]==NULL) + { + // Power law out degree with random weights + fout << "%%MatrixMarket matrix coordinate real general\n"; + fout << num_vertices(g) <<' '<< num_vertices(g) <<' '<< num_edges(g) << '\n'; + float val; + for( boost::tie(edge, edge_end) = boost::edges(g); edge != edge_end; ++edge) + { + val = (rand()%10)+(rand()%100)*(1e-2f); + fout << boost::source(*edge, g) << ' ' << boost::target(*edge, g)<< ' ' << val << '\n'; + } + } + else if (argv[2][0]=='i') + { + // Power law in degree (ie the transpose will have a power law) + // -- Edges only -- + // * Wraning * edges will be unsorted, use sort_edges.cpp to sort the dataset. + fout << num_vertices(g) <<' '<< num_edges(g) << '\n'; + for( boost::tie(edge, edge_end) = boost::edges(g); edge != edge_end; ++edge) + fout < +#include +#include + +#include +#include +#include +#include + + +void printUsageAndExit() +{ + printf("%s", "Usage:./rmatg x\n"); + printf("%s", "x is the size of the graph, x>32 (Boost generator hang if x<32)\n"); + exit(0); +} + +int main(int argc, char *argv[]) +{ + + // RMAT paper http://snap.stanford.edu/class/cs224w-readings/chakrabarti04rmat.pdf + // Boost doc on RMAT http://www.boost.org/doc/libs/1_49_0/libs/graph_parallel/doc/html/rmat_generator.html + + typedef boost::adjacency_list Graph; + typedef boost::unique_rmat_iterator RMATGen; + + if (argc < 2) printUsageAndExit(); + int size = atoi (argv[1]); + if (size<32) printUsageAndExit(); + assert (size > 31 && size < INT_MAX); + const unsigned num_edges = 16 * size; + /************************ + * RMAT Gen + ************************/ + std::cout << "generating ... "<<'\n'; + // values of a,b,c,d are from the graph500. + boost::minstd_rand gen; + Graph g(RMATGen(gen, size, num_edges, 0.57, 0.19, 0.19, 0.05, true), RMATGen(), size); + assert (num_edges == boost::num_edges(g)); + + /************************ + * Print + ************************/ + boost::graph_traits::edge_iterator edge, edge_end; + std::cout << "vertices : " << boost::num_vertices(g) <<'\n'; + std::cout << "edges : " << boost::num_edges(g) <<'\n'; + std::cout << "average degree : "<< static_cast(boost::num_edges(g))/boost::num_vertices(g)<< '\n'; + + // Print in matrix coordinate real general format + std::cout << "writing ... 
"<<'\n'; + std::stringstream tmp; + tmp <<"local_test_data/rmat_graph_" << size << ".mtx"; + const std::string filename = tmp.str(); + std::ofstream fout(tmp.str().c_str()) ; + if (argv[2]==NULL) + { + // Power law out degree with random weights + fout << "%%MatrixMarket matrix coordinate real general\n"; + fout << boost::num_vertices(g) <<' '<< boost::num_vertices(g) <<' '<< boost::num_edges(g) << '\n'; + float val; + for( boost::tie(edge, edge_end) = boost::edges(g); edge != edge_end; ++edge) + { + val = (rand()%10)+(rand()%100)*(1e-2f); + fout << boost::source(*edge, g) << ' ' << boost::target(*edge, g)<< ' ' << val << '\n'; + } + } + else if (argv[2][0]=='i') + { + // Power law in degree (ie the transpose will have a power law) + // -- Edges only -- + // * Wraning * edges will be unsorted, use sort_edges.cpp to sort the dataset. + fout << boost::num_vertices(g) <<' '<< boost::num_edges(g) << '\n'; + for( boost::tie(edge, edge_end) = boost::edges(g); edge != edge_end; ++edge) + fout < 0: + u, depth = Qset.pop() + if cutoff: + if cutoff < depth: + continue + #print "Looking at vertex ", u, ", depth = ", depth + for v, e in G_succ[u].items(): + cost = get_weight(u, v, e) + #print "Looking at vertex ", u, ", edge to ", v + if cost is None: + continue + alt = max(width[v], min(width[u], cost)) + if alt > width[v]: + width[v] = alt + Qset.add((v, depth+1)) + #print "Updated QSET: ", Qset + return width + +def single_source_dijkstra_widest(G, source, cutoff=None, + weight='weight'): + if G.is_multigraph(): + get_weight = lambda u, v, data: min( + eattr.get(weight, 1) for eattr in data.values()) + else: + get_weight = lambda u, v, data: data.get(weight, 1) + + return _dijkstra_custom(G, source, get_weight, cutoff=cutoff) + +print ('Networkx version : {} '.format(nx.__version__)) + +# Command line arguments +argc = len(sys.argv) +if argc<=2: + print("Error: usage is : python3 nvgraph_cpu_ref.py graph.mtx source_vertex") + sys.exit() +mmFile = sys.argv[1] +src = int(sys.argv[2]) +print('Reading '+ str(mmFile) + '...') +#Read +M = mmread(mmFile).transpose() + +if M is None : + raise TypeError('Could not read the input graph') + +# in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this explicitly +M = M.asfptype().tolil().tocsr() +if not M.has_sorted_indices: + M.sort_indices() + +# Directed NetworkX graph +Gnx = nx.DiGraph(M) + +#widest +print('Solving... ') +t1 = time.time() +widest = single_source_dijkstra_widest(Gnx,source=src) +t2 = time.time() - t1 + +print('Time : '+str(t2)) +print('Writing result ... ') + +# fill missing with DBL_MAX +bwidest = np.full(M.shape[0], -sys.float_info.max, dtype=np.float64) +for r in widest.keys(): + bwidest[r] = widest[r] +#print bwidest +# write binary +out_fname = os.path.splitext(os.path.basename(mmFile))[0] + '_T.widest_' + str(src) + '.bin' +bwidest.tofile(out_fname, "") +print ('Result is in the file: ' + out_fname) + +# write text +#f = open('/tmp/ref_' + os.path.basename(mmFile) + '_widest.txt', 'w') +#f.write(str(widest.values())) + +print('Done') diff --git a/cpp/nvgraph/test/ref/nerstrand/Makefile b/cpp/nvgraph/test/ref/nerstrand/Makefile new file mode 100644 index 00000000000..84c4b26bce4 --- /dev/null +++ b/cpp/nvgraph/test/ref/nerstrand/Makefile @@ -0,0 +1,17 @@ +CC=g++ +CFLAGS=-O3 -fopenmp +LDFLAGS=-I. -L. 
libnerstrand.a +EXEC=nerstrand_bench +SOURCES=nerstrand_driver.cpp mmio.cpp +OBJECTS=$(SOURCES:.cpp=.o) + +$(EXEC): $(OBJECTS) + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +mmio.o: mmio.cpp mmio.h + $(CC) $(CFLAGS) -c $< + +nerstand_driver.o: nerstand_driver.cpp mmio.h + $(CC) $(CFLAGS) -c $< +clean: + rm *.o \ No newline at end of file diff --git a/cpp/nvgraph/test/ref/nerstrand/README.txt b/cpp/nvgraph/test/ref/nerstrand/README.txt new file mode 100644 index 00000000000..9cf729217e5 --- /dev/null +++ b/cpp/nvgraph/test/ref/nerstrand/README.txt @@ -0,0 +1,18 @@ +This is stand alone host app that reads an undirected graph in matrix market format, convert it into CSR, call Nerstrand with default parameters and returns the modularity score of the clustering. + +Make sure you have downloaded and installed nerstrand : http://www-users.cs.umn.edu/~lasalle/nerstrand/ +You should have libnerstrand.a in /build/Linux-x86_64/lib, move it to the directory containing this README or adjust the Makefile. + +Type "make" to compile the small benchmarking app and "./nerstrand_bench " to execute. +For convenience there is also a benchmarking script that calls the benchmarking app (please adjust paths to binary and data sets). + +Use the following reference: +@article{lasalle2014nerstrand, + title={Multi-threaded Modularity Based Graph Clustering using the Multilevel Paradigm}, + journal = "Journal of Parallel and Distributed Computing ", + year = "2014", + issn = "0743-7315", + doi = "http://dx.doi.org/10.1016/j.jpdc.2014.09.012", + url = "http://www.sciencedirect.com/science/article/pii/S0743731514001750", + author = "Dominique LaSalle and George Karypis" +}​ diff --git a/cpp/nvgraph/test/ref/nerstrand/mm_host.hxx b/cpp/nvgraph/test/ref/nerstrand/mm_host.hxx new file mode 100644 index 00000000000..57ceba6d7bf --- /dev/null +++ b/cpp/nvgraph/test/ref/nerstrand/mm_host.hxx @@ -0,0 +1,259 @@ +#pragma once + +#include +extern "C" { +#include "mmio.h" +} + +/// Read matrix properties from Matrix Market file +/** Matrix Market file is assumed to be a sparse matrix in coordinate + * format. + * + * @param f File stream for Matrix Market file. + * @param tg Boolean indicating whether to convert matrix to general + * format (from symmetric, Hermitian, or skew symmetric format). + * @param t (Output) MM_typecode with matrix properties. + * @param m (Output) Number of matrix rows. + * @param n (Output) Number of matrix columns. + * @param nnz (Output) Number of non-zero matrix entries. + * @return Zero if properties were read successfully. Otherwise + * non-zero. 
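 *
 * Illustrative usage sketch, not part of the original header; it mirrors the
 * call sequence in nerstrand_driver.cpp further below, and graph.mtx is a
 * placeholder file name:
 *
 *   FILE *f = fopen("graph.mtx", "r");
 *   MM_typecode t;
 *   int m, n, nnz;
 *   if (f && mm_properties(f, 1, &t, &m, &n, &nnz) == 0) {
 *     std::vector<int>    row(nnz), col(nnz);
 *     std::vector<double> val(nnz);
 *     mm_to_coo(f, 1, nnz, &row[0], &col[0], &val[0], (double *) NULL);
 *   }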
+ */ +template +int mm_properties(FILE * f, int tg, MM_typecode * t, + IndexType_ * m, IndexType_ * n, + IndexType_ * nnz) { + + // Read matrix properties from file + int mint, nint, nnzint; + if(fseek(f,0,SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if(mm_read_banner(f,t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if(!mm_is_matrix(*t) || !mm_is_coordinate(*t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if(mm_read_mtx_crd_size(f,&mint,&nint,&nnzint)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if(!mm_is_pattern(*t) && !mm_is_real(*t) && + !mm_is_integer(*t) && !mm_is_complex(*t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + *m = mint; + *n = nint; + *nnz = nnzint; + + // Find total number of non-zero entries + if(tg && !mm_is_general(*t)) { + + // Non-diagonal entries should be counted twice + IndexType_ nnzOld = *nnz; + *nnz *= 2; + + // Diagonal entries should not be double-counted + int i; int st; + for(i=0; i +int mm_to_coo(FILE *f, int tg, IndexType_ nnz, + IndexType_ * cooRowInd, IndexType_ * cooColInd, + ValueType_ * cooRVal , ValueType_ * cooIVal) { + + // Read matrix properties from file + MM_typecode t; + int m, n, nnzOld; + if(fseek(f,0,SEEK_SET)) { + fprintf(stderr, "Error: could not set position in file\n"); + return -1; + } + if(mm_read_banner(f,&t)) { + fprintf(stderr, "Error: could not read Matrix Market file banner\n"); + return -1; + } + if(!mm_is_matrix(t) || !mm_is_coordinate(t)) { + fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); + return -1; + } + if(mm_read_mtx_crd_size(f,&m,&n,&nnzOld)) { + fprintf(stderr, "Error: could not read matrix dimensions\n"); + return -1; + } + if(!mm_is_pattern(t) && !mm_is_real(t) && + !mm_is_integer(t) && !mm_is_complex(t)) { + fprintf(stderr, "Error: matrix entries are not valid type\n"); + return -1; + } + + // Add each matrix entry in file to COO format matrix + IndexType_ i; // Entry index in Matrix Market file + IndexType_ j = 0; // Entry index in COO format matrix + for(i=0;i +void sort(IndexType_ *col_idx, ValueType_ *a, IndexType_ start, IndexType_ end) +{ + IndexType_ i, j, it; + ValueType_ dt; + + for (i=end-1; i>start; i--) + for(j=start; j col_idx[j+1]){ + + if (a){ + dt=a[j]; + a[j]=a[j+1]; + a[j+1]=dt; + } + it=col_idx[j]; + col_idx[j]=col_idx[j+1]; + col_idx[j+1]=it; + + } +} + +template +void coo2csr(IndexType_ n, IndexType_ nz, ValueType_ *a, IndexType_ *i_idx, IndexType_ *j_idx, + ValueType_ *csr_a, IndexType_ *col_idx, IndexType_ *row_start) +{ + IndexType_ i, l; + + for (i=0; i<=n; i++) row_start[i] = 0; + + /* determine row lengths */ + for (i=0; i0; i--) row_start[i] = row_start[i-1]; + + row_start[0] = 0; + + for (i=0; i +#include +#include +#include + +#include "mmio.h" + +int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, + double **val_, int **I_, int **J_) +{ + FILE *f; + MM_typecode matcode; + int M, N, nz; + int i; + double *val; + int *I, *J; + + if ((f = fopen(fname, "r")) == NULL) + return -1; + + + if (mm_read_banner(f, &matcode) != 0) + { + printf("mm_read_unsymetric: Could not process Matrix Market banner "); + printf(" in file [%s]\n", fname); + return -1; + } + + + + if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && + mm_is_sparse(matcode))) + { + fprintf(stderr, "Sorry, this application does not 
support "); + fprintf(stderr, "Market Market type: [%s]\n", + mm_typecode_to_str(matcode)); + return -1; + } + + /* find out size of sparse matrix: M, N, nz .... */ + + if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) + { + fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); + return -1; + } + + *M_ = M; + *N_ = N; + *nz_ = nz; + + /* reseve memory for matrices */ + + I = (int *) malloc(nz * sizeof(int)); + J = (int *) malloc(nz * sizeof(int)); + val = (double *) malloc(nz * sizeof(double)); + + *val_ = val; + *I_ = I; + *J_ = J; + + /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ + /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ + /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */ + + for (i=0; i + * Copyright 2013, Regents of the University of Minnesota + * @version 1 + * @date 2014-01-27 + */ + + + + +#ifndef NERSTRAND_H +#define NERSTRAND_H + + + + +#include +#include +#include + + + + +/****************************************************************************** +* VERSION ********************************************************************* +******************************************************************************/ + + +#define NERSTRAND_VER_MAJOR 0 +#define NERSTRAND_VER_MINOR 5 +#define NERSTRAND_VER_SUBMINOR 0 + + + + +/****************************************************************************** +* TYPES *********************************************************************** +******************************************************************************/ + + +#ifndef NERSTRAND_GRAPH_TYPES_DEFINED +#ifdef NERSTRAND_64BIT_VERTICES +typedef uint64_t vtx_t; +#else +typedef uint32_t vtx_t; +#endif +#ifdef NERSTRAND_64BIT_EDGES +typedef uint64_t adj_t; +#else +typedef uint32_t adj_t; +#endif +#ifdef NERSTRAND_DOUBLE_WEIGHTS +typedef double wgt_t; +#else +typedef float wgt_t; +#endif +#endif /* NERSTRAND_GRAPH_TYPES_DEFINED */ + + +#ifdef NERSTRAND_64BIT_CLUSTERS +typedef uint64_t cid_t; +#else +typedef uint32_t cid_t; +#endif + + + + +/****************************************************************************** +* ENUMS *********************************************************************** +******************************************************************************/ + + +typedef enum nerstrand_error_t { + NERSTRAND_SUCCESS = 1, + NERSTRAND_ERROR_INVALIDOPTIONS, + NERSTRAND_ERROR_INVALIDINPUT, + NERSTRAND_ERROR_NOTENOUGHMEMORY, + NERSTRAND_ERROR_UNIMPLEMENTED, + NERSTRAND_ERROR_UNKNOWN +} nerstrand_error_t; + + +typedef enum nerstrand_option_t { + NERSTRAND_OPTION_HELP, + NERSTRAND_OPTION_NCLUSTERS, + NERSTRAND_OPTION_NTHREADS, + NERSTRAND_OPTION_SEED, + NERSTRAND_OPTION_NRUNS, + NERSTRAND_OPTION_NREFPASS, + NERSTRAND_OPTION_NINITSOLUTIONS, + NERSTRAND_OPTION_AGGTYPE, + NERSTRAND_OPTION_CONTYPE, + NERSTRAND_OPTION_SPATYPE, + NERSTRAND_OPTION_DISTYPE, + NERSTRAND_OPTION_REFTYPE, + NERSTRAND_OPTION_INITYPE, + NERSTRAND_OPTION_PARTYPE, + NERSTRAND_OPTION_VERBOSITY, + NERSTRAND_OPTION_AGG_RATE, + NERSTRAND_OPTION_CNVTXS_PER_CLUSTER, + NERSTRAND_OPTION_MAXREFMOVES, + NERSTRAND_OPTION_TIME, + NERSTRAND_OPTION_MODSTATS, + NERSTRAND_OPTION_ICSTATS, + NERSTRAND_OPTION_LBSTATS, + NERSTRAND_OPTION_AGGSTATS, + NERSTRAND_OPTION_REFSTATS, + NERSTRAND_OPTION_SUPERNODE_RATIO, + NERSTRAND_OPTION_STOPRATIO, + NERSTRAND_OPTION_STOPCONDITION, + NERSTRAND_OPTION_DEGREE_WEIGHT, + NERSTRAND_OPTION_BLOCKSIZE, + NERSTRAND_OPTION_DISTRIBUTION, + NERSTRAND_OPTION_RESTEP, + __NERSTRAND_OPTION_TERM +} nerstrand_option_t; + 
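/*
 * Illustrative sketch, not part of the original header, and partly an
 * assumption: nerstrand_init_options() (declared below) returns a double
 * array of NERSTRAND_NOPTIONS entries that is handed to
 * nerstrand_cluster_explicit(). Indexing that array with the
 * nerstrand_option_t constants, as shown here, is inferred from the names in
 * this header rather than stated anywhere in the patch.
 *
 *   double *opts = nerstrand_init_options();
 *   opts[NERSTRAND_OPTION_NCLUSTERS] = 7;     // assumed: ask for 7 clusters
 *   opts[NERSTRAND_OPTION_SEED]      = 1234;  // assumed: fix the RNG seed
 *   // ... then pass opts as the `options` argument of
 *   // nerstrand_cluster_explicit().
 */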
+ +typedef enum nerstrand_parttype_t { + NERSTRAND_PARTITION_KWAY, + NERSTRAND_PARTITION_ANYWAY +} nerstrand_parttype_t; + + +typedef enum nerstrand_aggtype_t { + NERSTRAND_AGGREGATE_RM, + NERSTRAND_AGGREGATE_SHEM, + NERSTRAND_AGGREGATE_AGM, + NERSTRAND_AGGREGATE_AGH, + NERSTRAND_AGGREGATE_RC, + NERSTRAND_AGGREGATE_FC, + NERSTRAND_AGGREGATE_AGC +} nerstrand_aggtype_t; + + +typedef enum nerstrand_sparsifytype_t { + NERSTRAND_SPARSIFY_NONE, + NERSTRAND_SPARSIFY_RANDOM, + NERSTRAND_SPARSIFY_LIGHT, + NERSTRAND_SPARSIFY_HEAVY, + NERSTRAND_SPARSIFY_DEGREE +} nerstrand_sparsifytype_t; + + +typedef enum nerstrand_edgeremovaltype_t { + NERSTRAND_EDGEREMOVAL_DROP, + NERSTRAND_EDGEREMOVAL_LOOP, + NERSTRAND_EDGEREMOVAL_DISTRIBUTE, + NERSTRAND_EDGEREMOVAL_PHANTOM +} nerstrand_edgeremovaltype_t; + + +typedef enum nerstrand_ictype_t { + NERSTRAND_INITIAL_CLUSTERING_BFS, + NERSTRAND_INITIAL_CLUSTERING_RANDOM, + NERSTRAND_INITIAL_CLUSTERING_SEED, + NERSTRAND_INITIAL_CLUSTERING_NEWMAN, + NERSTRAND_INITIAL_CLUSTERING_LP, + NERSTRAND_INITIAL_CLUSTERING_GROW, + NERSTRAND_INITIAL_CLUSTERING_GROWKL, + NERSTRAND_INITIAL_CLUSTERING_VTX, + NERSTRAND_INITIAL_CLUSTERING_RVTX +} nerstrand_ictype_t; + + +typedef enum nerstrand_contype_t { + NERSTRAND_CONTRACT_SUM +} nerstrand_contype_t; + + +typedef enum nerstrand_projtype_t { + NERSTRAND_PROJECT_DIRECT, + NERSTRAND_PROJECT_SPARSE +} nerstrand_projtype_t; + + +typedef enum nerstrand_reftype_t { + NERSTRAND_REFINEMENT_GREEDY, + NERSTRAND_REFINEMENT_RANDOM +} nerstrand_reftype_t; + + +typedef enum nerstrand_verbosity_t { + NERSTRAND_VERBOSITY_MINIMUM=10, + NERSTRAND_VERBOSITY_LOW=20, + NERSTRAND_VERBOSITY_MEDIUM=30, + NERSTRAND_VERBOSITY_HIGH=40, + NERSTRAND_VERBOSITY_MAXIMUM=50 +} nerstrand_verbosity_t; + + +typedef enum nerstrand_stopcondition_t { + NERSTRAND_STOPCONDITION_EDGES, + NERSTRAND_STOPCONDITION_VERTICES, + NERSTRAND_STOPCONDITION_SIZE +} nerstrand_stopcondition_t; + + +typedef enum nerstrand_distribution_t { + NERSTRAND_DISTRIBUTION_BLOCK, + NERSTRAND_DISTRIBUTION_CYCLIC, + NERSTRAND_DISTRIBUTION_BLOCKCYCLIC +} nerstrand_distribution_t; + + + + +/****************************************************************************** +* CONSTANTS ******************************************************************* +******************************************************************************/ + + +static const size_t NERSTRAND_NOPTIONS = __NERSTRAND_OPTION_TERM; +static const double NERSTRAND_VAL_OFF = -DBL_MAX; + + + + +/****************************************************************************** +* FUNCTION PROTOTYPES ********************************************************* +******************************************************************************/ + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @brief Allocate and initialize a set of options for use with the + * nerstrand_cluster_explicit() function. + * + * @return The allocated and initialized options. + */ +double * nerstrand_init_options(void); + + +/** + * @brief Generate a clustering of a graph with a speficied set of options. + * + * @param r_nvtxs A pointer to the number of vertices in the graph. + * @param xadj The start of the adjacency list of each vertex. + * @param adjncy The vertex at the far end of each edge, indexed by xadj. + * @param adjwgt The weight of each edge, indexed by xadj. + * @param options The options array specifying the parameters for generating + * the clustering. + * @param r_nclusters A pointer to the number of clusters. 
+ * @param cid The cluster assignment for each vertex. + * @param r_mod A pointer to the modularity of the generated clustering. + * + * @return NERSTRAND_SUCCESS unless an error is encountered. + */ +int nerstrand_cluster_explicit( + vtx_t const * r_nvtxs, + adj_t const * xadj, + vtx_t const * adjncy, + wgt_t const * adjwgt, + double const * options, + cid_t * r_nclusters, + cid_t * cid, + double * r_mod); + + +/** + * @brief Generate a clustering of a graph with specified number of clusters. + * + * @param r_nvtxs A pointer to the number of vertices in the graph. + * @param xadj The start of the adjacency list of each vertex. + * @param adjncy The vertex at the far end of each edge, indexed by xadj. + * @param adjwgt The weight of each edge, indexed by xadj. + * @param r_nclusters A pointer to the number of clusters. + * @param cid The cluster assignment for each vertex. + * @param r_mod A pointer to the modularity of the generated clustering. + * + * @return NERSTRAND_SUCCESS unless an error is encountered. + */ +int nerstrand_cluster_kway( + vtx_t const * r_nvtxs, + adj_t const * xadj, + vtx_t const * adjncy, + wgt_t const * adjwgt, + cid_t const * r_nclusters, + cid_t * cid, + double * r_mod); + + +/** + * @brief Generate a clustering of a graph with an unspecified number of + * clusters. + * + * @param r_nvtxs A pointer to the number of vertices in the graph. + * @param xadj The start of the adjacency list of each vertex. + * @param adjncy The vertex at the far end of each edge, indexed by xadj. + * @param adjwgt The weight of each edge, indexed by xadj. + * @param r_nclusters A pointer to the number of clusters. + * @param cid The cluster assignment for each vertex. + * @param r_mod A pointer to the modularity of the generated clustering. + * + * @return NERSTRAND_SUCCESS unless an error is encountered. 
 */ +int nerstrand_cluster_anyway( + vtx_t const * r_nvtxs, + adj_t const * xadj, + vtx_t const * adjncy, + wgt_t const * adjwgt, + cid_t * r_nclusters, + cid_t * cid, + double * r_mod); + + + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/cpp/nvgraph/test/ref/nerstrand/nerstrand_driver.cpp b/cpp/nvgraph/test/ref/nerstrand/nerstrand_driver.cpp new file mode 100644 index 00000000000..96ad36172aa --- /dev/null +++ b/cpp/nvgraph/test/ref/nerstrand/nerstrand_driver.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mmio.h" + +#include "mm_host.hxx" +#include "nerstrand.h" + + +static double second (void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + + +int main(int argc, const char **argv) +{ + + int m, n, nnz; + double start, stop,r_mod; + cid_t n_clusters; + MM_typecode mc; + if (argc != 3) + { + std::cout<<"Usage : ./nerstrand_bench "<(fpin, 1, &mc, &m, &n, &nnz) ; + + // Allocate memory on host + std::vector<int> cooRowIndA(nnz); + std::vector<int> cooColIndA(nnz); + std::vector<double> cooValA(nnz); + std::vector<int> csrRowPtrA(n+1); + std::vector<int> csrColIndA(nnz); + std::vector<double> csrValA(nnz); + + + mm_to_coo(fpin, 1, nnz, &cooRowIndA[0], &cooColIndA[0], &cooValA[0],NULL) ; + coo2csr (n, nnz, &cooValA[0], &cooRowIndA[0], &cooColIndA[0], &csrValA[0], &csrColIndA[0],&csrRowPtrA[0]); + fclose(fpin); + + vtx_t nerstrand_n = static_cast<vtx_t>(n); + std::vector<adj_t> nerstrand_csrRowPtrA(csrRowPtrA.begin(), csrRowPtrA.end()); + std::vector<vtx_t> nerstrand_csrColIndA(csrColIndA.begin(), csrColIndA.end()); + std::vector<wgt_t> nerstrand_csrValA(csrValA.begin(), csrValA.end()); + std::vector<cid_t> clustering(n); + + start = second(); + start = second(); + #pragma omp_parallel + { + int nerstrand_status = nerstrand_cluster_kway(&nerstrand_n, &nerstrand_csrRowPtrA[0],&nerstrand_csrColIndA[0], &nerstrand_csrValA[0], &n_clusters, &clustering[0], &r_mod); + if (nerstrand_status != NERSTRAND_SUCCESS) + std::cout<<"nerstrand execution failed"< +#include +#include //file output +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void printUsageAndExit() +{ + printf("%s", "Usage:./rmatg x y\n"); + printf("%s", "x is the size of the graph, x>32 (Boost generator hang if x<32)\n"); + printf("%s", "y is the source of sssp\n"); + exit(0); +} + +int main(int argc, char *argv[]) +{ + // read size + if (argc < 3) printUsageAndExit(); + int size = atoi (argv[1]); + if (size<32) printUsageAndExit(); + int source_sssp =atoi (argv[2]); + assert (size > 1 && size < INT_MAX); + assert (source_sssp >= 0 && source_sssp < size); + const unsigned num_edges = 15 * size; + + // Some boost types + typedef boost::no_property VertexProperty; + typedef boost::property EdgeProperty; + typedef boost::adjacency_list Graph; + typedef boost::unique_rmat_iterator RMATGen; + typedef boost::graph_traits<Graph>::vertex_descriptor vertex_descriptor; + boost::minstd_rand gen; + boost::graph_traits<Graph>::edge_iterator edge, edge_end; + + /************************ + * Random weights + ************************/ + // !!! WARNING !!! + // watch the stack + float* weight = new float[num_edges]; + int count = 0; + for( int i = 0; i < num_edges; ++i) + weight[i] = (rand()%10)+(rand()%100)*(1.2e-2f); + + /************************ + * RMAT Gen + ************************/ + Graph g(RMATGen(gen, size, num_edges, 0.57, 0.19, 0.19, 0.05,true),RMATGen(),weight, size); + std::cout << "Generator : done.
Edges = "< p(num_vertices(g)); + std::vector d(num_vertices(g)); + vertex_descriptor s = vertex(source_sssp, g); //define soruce node + + double start = omp_get_wtime(); + dijkstra_shortest_paths(g, s, + predecessor_map(boost::make_iterator_property_map(p.begin(), get(boost::vertex_index, g))). + distance_map(boost::make_iterator_property_map(d.begin(), get(boost::vertex_index, g)))); + + double stop = omp_get_wtime(); + std::cout << "Time = " << stop-start << "s"<< std::endl; + + /************************ + * Print + ************************/ + /* + boost::graph_traits::vertex_iterator vi, vend; + std::cout << "SOURCE = "<< source_sssp << std::endl; + for (boost::tie(vi, vend) = vertices(g); vi != vend; ++vi) + { + if (d[*vi] != FLT_MAX) + { + std::cout << "d(" << *vi << ") = " << d[*vi] << ", "; + std::cout << "parent = " << p[*vi] << std::endl; + } + else + std::cout << "d(" << *vi << ") = INF"<< std::endl; + } + */ + return 0; + +} + diff --git a/cpp/nvgraph/test/run_all_tests.sh b/cpp/nvgraph/test/run_all_tests.sh new file mode 100755 index 00000000000..83bb80093b5 --- /dev/null +++ b/cpp/nvgraph/test/run_all_tests.sh @@ -0,0 +1,16 @@ +#!/bin/sh +#Usage sh run_all_tests.sh +#Run all the tests in the current directory (ie. you should copy it in your build/test/ directory). +test="nvgraph_test +csrmv_test +semiring_maxmin_test +semiring_minplus_test +semiring_orand_test +pagerank_test +sssp_test +max_flow_test" + +for i in $test +do +./$i +done diff --git a/cpp/src/tests/CMakeLists.txt b/cpp/src/tests/CMakeLists.txt index 2eac9f8c0f2..eb21cc13d63 100644 --- a/cpp/src/tests/CMakeLists.txt +++ b/cpp/src/tests/CMakeLists.txt @@ -34,24 +34,6 @@ else() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -D_GLIBCXX_USE_CXX11_ABI=0") endif(CMAKE_CXX11_ABI) - -################################################################################################### -# - add nvgraph ----------------------------------------------------------------------------------- -#if(NOT TARGET NVGRAPH AND NVG_PLUGIN) -# find_path(NVGRAPH_INCLUDE "nvgraph" -# HINTS "$ENV{NVGRAPH_ROOT}/include" -# "$ENV{CONDA_PREFIX}/include") -# find_library(NVGRAPH_LIBRARY "nvgraph_st" -# HINTS "$ENV{NVGRAPH_ROOT}/lib" -# "$ENV{CONDA_PREFIX}/lib") -# -# add_library( nvgraph SHARED IMPORTED) -# if (NVGRAPH_INCLUDE AND NVGRAPH_LIBRARY) -# set_target_properties( nvgraph PROPERTIES IMPORTED_LOCATION ${NVGRAPH_LIBRARY}) -# message(STATUS "nvgraph found in ${NVGRAPH_LIBRARY}") -# endif (NVGRAPH_INCLUDE AND NVGRAPH_LIBRARY) -#endif(NOT TARGET NVGRAPH AND NVG_PLUGIN) - ################################################################################################### # - compiler function ----------------------------------------------------------------------------- function(configure_test TEST_NAME Tests_SRCS) diff --git a/python/setup.py b/python/setup.py index e2ade8cd5b0..594119d1a6a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -28,10 +28,10 @@ def find_in_path(name, path): def locate_nvgraph(): if 'CONDA_PREFIX' in os.environ: - nvgraph_found = find_in_path('lib/libnvgraph_st.so', + nvgraph_found = find_in_path('lib/libnvgraph_rapids.so', os.environ['CONDA_PREFIX']) if nvgraph_found is None: - nvgraph_found = find_in_path('libnvgraph_st.so', + nvgraph_found = find_in_path('libnvgraph_rapids.so', os.environ['LD_LIBRARY_PATH']) if nvgraph_found is None: raise EnvironmentError('The nvgraph library could not be located') @@ -65,7 +65,7 @@ def locate_nvgraph(): '../cpp/build/gunrock/externals/moderngpu/include', 
'../cpp/build/gunrock/externals/cub'], library_dirs=[get_python_lib(), NVGRAPH['lib']], - libraries=['cugraph', 'cudf', 'nvgraph_st'], + libraries=['cugraph', 'cudf', 'nvgraph_rapids'], language='c++', extra_compile_args=['-std=c++14']) ]
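Note on the nerstrand C API introduced by this patch (cpp/nvgraph/test/ref/nerstrand/nerstrand.h): the reference driver above reaches the clustering call only after Matrix Market parsing and CSR conversion, which can obscure the actual call sequence. The snippet below is a minimal, illustrative sketch of driving nerstrand_cluster_kway() directly and is not part of the commit; it assumes the header's default typedefs (32-bit vtx_t/adj_t/cid_t, float wgt_t) and linking against the nerstrand library used by the reference tests, and the 6-vertex CSR graph, unit weights, and requested cluster count are made-up example values.

```c
/* Illustrative sketch only (not part of this patch): cluster a tiny CSR graph
 * with the API declared in nerstrand.h. The graph is two triangles {0,1,2}
 * and {3,4,5} joined by the edge 2-3, stored with both edge directions. */
#include <stdio.h>
#include "nerstrand.h"

int main(void)
{
    vtx_t nvtxs = 6;
    adj_t xadj[]   = {0, 2, 4, 7, 10, 12, 14};              /* CSR row offsets */
    vtx_t adjncy[] = {1,2, 0,2, 0,1,3, 2,4,5, 3,5, 3,4};    /* neighbor lists  */
    wgt_t adjwgt[] = {1,1, 1,1, 1,1,1, 1,1,1, 1,1, 1,1};    /* unit weights    */

    cid_t nclusters = 2;   /* fixed cluster count for the kway variant */
    cid_t cid[6];          /* output: cluster id per vertex            */
    double mod = 0.0;      /* output: modularity of the clustering     */

    int rc = nerstrand_cluster_kway(&nvtxs, xadj, adjncy, adjwgt,
                                    &nclusters, cid, &mod);
    if (rc != NERSTRAND_SUCCESS) {
        fprintf(stderr, "nerstrand_cluster_kway failed: %d\n", rc);
        return 1;
    }
    printf("modularity = %f\n", mod);
    for (vtx_t v = 0; v < nvtxs; ++v)
        printf("vertex %u -> cluster %u\n", (unsigned)v, (unsigned)cid[v]);
    return 0;
}
```

Per the prototypes in the header, nerstrand_cluster_anyway() takes the same arrays but a writable cluster-count pointer (the library chooses the number of clusters), and nerstrand_cluster_explicit() additionally takes the options array returned by nerstrand_init_options().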